/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * sun4v CPU DR Module
 */

#include <sys/modctl.h>
#include <sys/processor.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/note.h>
#include <sys/sysevent/dr.h>
#include <sys/hypervisor_api.h>
#include <sys/mach_descrip.h>
#include <sys/mdesc.h>
#include <sys/ds.h>
#include <sys/drctl.h>
#include <sys/dr_util.h>
#include <sys/dr_cpu.h>
#include <sys/promif.h>
#include <sys/machsystm.h>


static struct modlmisc modlmisc = {
	&mod_miscops,
	"sun4v CPU DR"
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlmisc,
	NULL
};

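/*
 * Common signature of the per-CPU worker routines
 * (dr_cpu_configure() and dr_cpu_unconfigure()) so that
 * dr_cpu_list_wrk() can dispatch through a single pointer.
 */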
typedef int (*fn_t)(processorid_t, int *, boolean_t);

/*
 * Global DS Handle
 */
static ds_svc_hdl_t ds_handle;

/*
 * Supported DS Capability Versions
 */
static ds_ver_t		dr_cpu_vers[] = { { 1, 1 }, { 1, 0 } };
#define	DR_CPU_NVERS	(sizeof (dr_cpu_vers) / sizeof (dr_cpu_vers[0]))

static ds_ver_t		version;

/*
 * DS Capability Description
 */
static ds_capability_t dr_cpu_cap = {
	DR_CPU_DS_ID,		/* svc_id */
	dr_cpu_vers,		/* vers */
	DR_CPU_NVERS		/* nvers */
};

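/*
 * The negotiated version determines how errors are reported back
 * to the service entity: v1.0 responses carry no error detail,
 * while v1.1 and later may append an error string to an error
 * response (see dr_cpu_err_resp() and dr_cpu_list_wrk()).
 */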
#define	DRCPU_VERS_EQ(_maj, _min) \
	((version.major == (_maj)) && (version.minor == (_min)))

#define	DRCPU_VERS_GTEQ(_maj, _min) \
	((version.major > (_maj)) ||					\
	((version.major == (_maj)) && (version.minor >= (_min))))

/*
 * DS Callbacks
 */
static void dr_cpu_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
static void dr_cpu_unreg_handler(ds_cb_arg_t arg);
static void dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);

/*
 * DS Client Ops Vector
 */
static ds_clnt_ops_t dr_cpu_ops = {
	dr_cpu_reg_handler,	/* ds_reg_cb */
	dr_cpu_unreg_handler,	/* ds_unreg_cb */
	dr_cpu_data_handler,	/* ds_data_cb */
	NULL			/* cb_arg */
};

/*
 * Operation Results
 *
 * Used internally to gather results while an operation on a
 * list of CPUs is in progress. In particular, it is used to
 * keep track of which CPUs have already failed so that they are
 * not processed further, and the manner in which they failed.
 */
typedef struct {
	uint32_t	cpuid;
	uint32_t	result;
	uint32_t	status;
	char		*string;
} dr_cpu_res_t;

#define	DR_CPU_MAX_ERR_LEN	64	/* maximum error string length */

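/*
 * Approximate message layout, as implied by the DR_CPU_CMD_CPUIDS()
 * and DR_CPU_RESP_STATS() macros from <sys/dr_cpu.h>:
 *
 *	request:	[ dr_cpu_hdr_t ][ uint32_t cpuids[num_records] ]
 *	response:	[ dr_cpu_hdr_t ][ dr_cpu_stat_t stats[num_records] ]
 *			[ error strings referenced by stats[i].string_off ]
 */
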
/*
 * Internal Functions
 */
static int dr_cpu_init(void);
static int dr_cpu_fini(void);

static int dr_cpu_list_wrk(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);
static int dr_cpu_list_status(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);

static int dr_cpu_unconfigure(processorid_t, int *status, boolean_t force);
static int dr_cpu_configure(processorid_t, int *status, boolean_t force);
static int dr_cpu_status(processorid_t, int *status);

static void dr_cpu_check_cpus(dr_cpu_hdr_t *req, dr_cpu_res_t *res);
static void dr_cpu_check_psrset(uint32_t *cpuids, dr_cpu_res_t *res, int nres);
static int dr_cpu_check_bound_thr(cpu_t *cp, dr_cpu_res_t *res);

static dr_cpu_res_t *dr_cpu_res_array_init(dr_cpu_hdr_t *, drctl_rsrc_t *, int);
static void dr_cpu_res_array_fini(dr_cpu_res_t *res, int nres);
static size_t dr_cpu_pack_response(dr_cpu_hdr_t *req, dr_cpu_res_t *res,
    dr_cpu_hdr_t **respp);

static int dr_cpu_probe(processorid_t newcpuid);
static int dr_cpu_deprobe(processorid_t cpuid);

static dev_info_t *dr_cpu_find_node(processorid_t cpuid);
static mde_cookie_t dr_cpu_find_node_md(processorid_t, md_t *, mde_cookie_t *);

int
_init(void)
{
	int	status;

	/* check that CPU DR is enabled */
	if (dr_is_disabled(DR_TYPE_CPU)) {
		cmn_err(CE_CONT, "!CPU DR is disabled\n");
		return (-1);
	}

	if ((status = dr_cpu_init()) != 0) {
		cmn_err(CE_NOTE, "CPU DR initialization failed");
		return (status);
	}

	if ((status = mod_install(&modlinkage)) != 0) {
		(void) dr_cpu_fini();
	}

	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

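/*
 * Unloading is disallowed by default since the module may have
 * outstanding DS registrations. Presumably for development use,
 * setting dr_cpu_allow_unload to a non-zero value (e.g. via
 * /etc/system) lets _fini() proceed.
 */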
int dr_cpu_allow_unload;

int
_fini(void)
{
	int	status;

	if (dr_cpu_allow_unload == 0)
		return (EBUSY);

	if ((status = mod_remove(&modlinkage)) == 0) {
		(void) dr_cpu_fini();
	}

	return (status);
}

static int
dr_cpu_init(void)
{
	int	rv;

	if ((rv = ds_cap_init(&dr_cpu_cap, &dr_cpu_ops)) != 0) {
		cmn_err(CE_NOTE, "ds_cap_init failed: %d", rv);
		return (-1);
	}

	return (0);
}

static int
dr_cpu_fini(void)
{
	int	rv;

	if ((rv = ds_cap_fini(&dr_cpu_cap)) != 0) {
		cmn_err(CE_NOTE, "ds_cap_fini failed: %d", rv);
		return (-1);
	}

	return (0);
}

static void
dr_cpu_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
{
	DR_DBG_CPU("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
	    ver->major, ver->minor, hdl);

	version.major = ver->major;
	version.minor = ver->minor;
	ds_handle = hdl;
}

static void
dr_cpu_unreg_handler(ds_cb_arg_t arg)
{
	DR_DBG_CPU("unreg_handler: arg=0x%p\n", arg);

	ds_handle = DS_INVALID_HDL;
}

static void
dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
{
	_NOTE(ARGUNUSED(arg))

	dr_cpu_hdr_t	*req = buf;
	dr_cpu_hdr_t	err_resp;
	dr_cpu_hdr_t	*resp = &err_resp;
	int		resp_len = 0;
	int		rv;

	/*
	 * Sanity check the message
	 */
	if (buflen < sizeof (dr_cpu_hdr_t)) {
		DR_DBG_CPU("incoming message short: expected at least %ld "
		    "bytes, received %ld\n", sizeof (dr_cpu_hdr_t), buflen);
		goto done;
	}

	if (req == NULL) {
		DR_DBG_CPU("empty message: expected at least %ld bytes\n",
		    sizeof (dr_cpu_hdr_t));
		goto done;
	}

	DR_DBG_CPU("incoming request:\n");
	DR_DBG_DUMP_MSG(buf, buflen);

	if (req->num_records > NCPU) {
		DR_DBG_CPU("CPU list too long: %d when %d is the maximum\n",
		    req->num_records, NCPU);
		goto done;
	}

	if (req->num_records == 0) {
		DR_DBG_CPU("No CPU specified for operation\n");
		goto done;
	}

	/*
	 * Process the command
	 */
	switch (req->msg_type) {
	case DR_CPU_CONFIGURE:
	case DR_CPU_UNCONFIGURE:
	case DR_CPU_FORCE_UNCONFIG:
		if ((rv = dr_cpu_list_wrk(req, &resp, &resp_len)) != 0) {
			DR_DBG_CPU("%s%s failed (%d)\n",
			    (req->msg_type == DR_CPU_CONFIGURE) ?
			    "CPU configure" : "CPU unconfigure",
			    (req->msg_type == DR_CPU_FORCE_UNCONFIG) ?
			    " (forced)" : "", rv);
		}
		break;

	case DR_CPU_STATUS:
		if ((rv = dr_cpu_list_status(req, &resp, &resp_len)) != 0)
			DR_DBG_CPU("CPU status failed (%d)\n", rv);
		break;

	default:
		cmn_err(CE_NOTE, "unsupported DR operation (%d)",
		    req->msg_type);
		break;
	}

done:
	/* check if an error occurred */
	if (resp == &err_resp) {
		resp->req_num = (req) ? req->req_num : 0;
		resp->msg_type = DR_CPU_ERROR;
		resp->num_records = 0;
		resp_len = sizeof (dr_cpu_hdr_t);
	}

	DR_DBG_CPU("outgoing response:\n");
	DR_DBG_DUMP_MSG(resp, resp_len);

	/* send back the response */
	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
		DR_DBG_CPU("ds_send failed\n");
	}

	/* free any allocated memory */
	if (DRCPU_VERS_GTEQ(1, 1) || (resp != &err_resp)) {
		DR_DBG_KMEM("%s: free addr %p size %d\n",
		    __func__, (void *)resp, resp_len);
		kmem_free(resp, resp_len);
	}
}

/*
 * Create a response message which consists of a header followed
 * by the error string passed in.
 */
static size_t
dr_cpu_err_resp(dr_cpu_hdr_t *req, dr_cpu_hdr_t **respp, char *msg)
{
	size_t size;
	dr_cpu_hdr_t *resp;

	ASSERT((msg != NULL) && (strlen(msg) > 0));

	size = sizeof (*req) + strlen(msg) + 1;
	resp = kmem_alloc(size, KM_SLEEP);
	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
	    __func__, (void *)resp, size);

	resp->req_num = req->req_num;
	resp->msg_type = DR_CPU_ERROR;
	resp->num_records = 0;

	(void) strcpy((char *)(resp) + sizeof (*resp), msg);

	*respp = resp;

	return (size);
}
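
/*
 * The resulting message is laid out as:
 *
 *	[ dr_cpu_hdr_t (msg_type = DR_CPU_ERROR, num_records = 0) ]
 *	[ NUL-terminated error string ]
 *
 * Only v1.1 and later consumers understand this extended error
 * format; see the version checks in dr_cpu_list_wrk() and
 * dr_cpu_data_handler().
 */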

/*
 * Common routine to config or unconfig multiple cpus.  The unconfig
 * case checks with the OS to see if the removal of cpus will be
 * permitted, but can be overridden by the "force" version of the
 * command.  Otherwise, the logic for both cases is identical.
 *
 * Note: Do not modify result buffer or length on error.
 */
static int
dr_cpu_list_wrk(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
{
	int		rv;
	int		idx;
	int		count;
	fn_t		dr_fn;
	int		se_hint;
	boolean_t	force = B_FALSE;
	uint32_t	*req_cpus;
	dr_cpu_res_t	*res;
	int		drctl_cmd;
	int		drctl_flags = 0;
	drctl_rsrc_t	*drctl_req;
	size_t		drctl_req_len;
	drctl_resp_t	*drctl_resp;
	drctl_rsrc_t	*drctl_rsrc;
	size_t		drctl_resp_len = 0;
	drctl_cookie_t	drctl_res_ck;

	ASSERT((req != NULL) && (req->num_records != 0));

	count = req->num_records;

	/*
	 * Extract all information that is specific
	 * to the various types of operations.
	 */
	switch (req->msg_type) {
	case DR_CPU_CONFIGURE:
		dr_fn = dr_cpu_configure;
		drctl_cmd = DRCTL_CPU_CONFIG_REQUEST;
		se_hint = SE_HINT_INSERT;
		break;
	case DR_CPU_FORCE_UNCONFIG:
		drctl_flags = DRCTL_FLAG_FORCE;
		force = B_TRUE;
		_NOTE(FALLTHROUGH)
	case DR_CPU_UNCONFIGURE:
		dr_fn = dr_cpu_unconfigure;
		drctl_cmd = DRCTL_CPU_UNCONFIG_REQUEST;
		se_hint = SE_HINT_REMOVE;
		break;
	default:
		/* Programming error if we reach this. */
		cmn_err(CE_NOTE,
		    "%s: bad msg_type %d\n", __func__, req->msg_type);
		ASSERT(0);
		return (-1);
	}
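
	/*
	 * From here the operation proceeds in three steps:
	 * drctl_config_init() asks the DR control layer to approve
	 * the request (it may veto individual CPUs), the approved
	 * CPUs are processed via dr_fn, and drctl_config_fini()
	 * reports the per-CPU results back to the control layer.
	 */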

	/* the incoming array of cpuids to operate on */
	req_cpus = DR_CPU_CMD_CPUIDS(req);

	/* allocate drctl request msg based on incoming resource count */
	drctl_req_len = sizeof (drctl_rsrc_t) * count;
	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);
	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
	    __func__, (void *)drctl_req, drctl_req_len);

	/* copy the cpuids for the drctl call from the incoming request msg */
	for (idx = 0; idx < count; idx++)
		drctl_req[idx].res_cpu_id = req_cpus[idx];

	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);

	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));

	if (rv != 0) {
		DR_DBG_CPU("%s: drctl_config_init "
		    "returned: %d\n", __func__, rv);

		if (DRCPU_VERS_EQ(1, 0)) {
			rv = -1;
		} else {
			ASSERT(DRCPU_VERS_GTEQ(1, 1));
			ASSERT(drctl_resp->resp_type == DRCTL_RESP_ERR);

			*resp_len = dr_cpu_err_resp(req,
			    resp, drctl_resp->resp_err_msg);
		}

		DR_DBG_KMEM("%s: free addr %p size %ld\n",
		    __func__, (void *)drctl_resp, drctl_resp_len);
		kmem_free(drctl_resp, drctl_resp_len);
		DR_DBG_KMEM("%s: free addr %p size %ld\n",
		    __func__, (void *)drctl_req, drctl_req_len);
		kmem_free(drctl_req, drctl_req_len);

		return (rv);
	}

	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);

	drctl_rsrc = drctl_resp->resp_resources;

	/* create the result scratch array */
	res = dr_cpu_res_array_init(req, drctl_rsrc, count);

	/*
	 * For unconfigure, check if there are any conditions
	 * that will cause the operation to fail. These are
	 * performed before the actual unconfigure attempt so
	 * that a meaningful error message can be generated.
	 */
	if (req->msg_type != DR_CPU_CONFIGURE)
		dr_cpu_check_cpus(req, res);

	/* perform the specified operation on each of the CPUs */
	for (idx = 0; idx < count; idx++) {
		int result;
		int status;

		/*
		 * If no action will be taken against the current
		 * CPU, update the drctl resource information to
		 * ensure that it gets recovered properly during
		 * the drctl fini() call.
		 */
		if (res[idx].result != DR_CPU_RES_OK) {
			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
			continue;
		}

		/* call the function to perform the actual operation */
		result = (*dr_fn)(req_cpus[idx], &status, force);

		/* save off results of the operation */
		res[idx].result = result;
		res[idx].status = status;

		/* save result for drctl fini() reusing init() msg memory */
		drctl_req[idx].status = (result != DR_CPU_RES_OK) ?
		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;

		DR_DBG_CPU("%s: cpuid %d status %d result %d off '%s'\n",
		    __func__, req_cpus[idx], drctl_req[idx].status, result,
		    (res[idx].string) ? res[idx].string : "");
	}

	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
		DR_DBG_CPU("%s: drctl_config_fini "
		    "returned: %d\n", __func__, rv);

	/*
	 * Operation completed without any fatal errors.
	 * Pack the response for transmission.
	 */
	*resp_len = dr_cpu_pack_response(req, res, resp);

	/* notify interested parties about the operation */
	dr_generate_event(DR_TYPE_CPU, se_hint);

	/*
	 * Deallocate any scratch memory.
	 */
	DR_DBG_KMEM("%s: free addr %p size %ld\n",
	    __func__, (void *)drctl_resp, drctl_resp_len);
	kmem_free(drctl_resp, drctl_resp_len);
	DR_DBG_KMEM("%s: free addr %p size %ld\n",
	    __func__, (void *)drctl_req, drctl_req_len);
	kmem_free(drctl_req, drctl_req_len);

	dr_cpu_res_array_fini(res, count);

	return (0);
}

/*
 * Allocate and initialize a result array based on the initial
 * drctl operation. A valid result array is always returned.
 */
static dr_cpu_res_t *
dr_cpu_res_array_init(dr_cpu_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
{
	int		idx;
	dr_cpu_res_t	*res;
	char		*err_str;
	size_t		err_len;

	/* allocate zero filled buffer to initialize fields */
	res = kmem_zalloc(nrsrc * sizeof (dr_cpu_res_t), KM_SLEEP);
	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
	    __func__, (void *)res, nrsrc * sizeof (dr_cpu_res_t));

	/*
	 * Fill in the result information for each resource.
	 */
	for (idx = 0; idx < nrsrc; idx++) {
		res[idx].cpuid = rsrc[idx].res_cpu_id;
		res[idx].result = DR_CPU_RES_OK;

		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
			continue;

		/*
		 * Update the state information for this CPU.
		 */
		res[idx].result = DR_CPU_RES_BLOCKED;
		res[idx].status = (req->msg_type == DR_CPU_CONFIGURE) ?
		    DR_CPU_STAT_UNCONFIGURED : DR_CPU_STAT_CONFIGURED;

		/*
		 * If an error string exists, copy it out of the
		 * message buffer. This eliminates any dependency
		 * on the memory allocated for the message buffer
		 * itself.
		 */
		if (rsrc[idx].offset != NULL) {
			err_str = (char *)rsrc + rsrc[idx].offset;
			err_len = strlen(err_str) + 1;

			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
			DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
			    __func__, (void *)(res[idx].string), err_len);
			bcopy(err_str, res[idx].string, err_len);
		}
	}

	return (res);
}

static void
dr_cpu_res_array_fini(dr_cpu_res_t *res, int nres)
{
	int	idx;
	size_t	str_len;

	for (idx = 0; idx < nres; idx++) {
		/* deallocate the error string if present */
		if (res[idx].string) {
			str_len = strlen(res[idx].string) + 1;
			DR_DBG_KMEM("%s: free addr %p size %ld\n",
			    __func__, (void *)(res[idx].string), str_len);
			kmem_free(res[idx].string, str_len);
		}
	}

	/* deallocate the result array itself */
	DR_DBG_KMEM("%s: free addr %p size %ld\n",
	    __func__, (void *)res, sizeof (dr_cpu_res_t) * nres);
	kmem_free(res, sizeof (dr_cpu_res_t) * nres);
}

/*
 * Allocate and pack a response message for transmission based
 * on the specified result array. A valid response message and
 * valid size information is always returned.
 */
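/*
 * Response layout:
 *
 *	[ dr_cpu_hdr_t ]
 *	[ dr_cpu_stat_t stats[num_records] ]
 *	[ error strings, each referenced by stats[i].string_off,
 *	  a byte offset from the start of the message ]
 */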
static size_t
dr_cpu_pack_response(dr_cpu_hdr_t *req, dr_cpu_res_t *res, dr_cpu_hdr_t **respp)
{
	int		idx;
	dr_cpu_hdr_t	*resp;
	dr_cpu_stat_t	*resp_stat;
	size_t		resp_len;
	uint32_t	curr_off;
	caddr_t		curr_str;
	size_t		str_len;
	size_t		stat_len;
	int		nstat = req->num_records;

	/*
	 * Calculate the size of the response message
	 * and allocate an appropriately sized buffer.
	 */
	resp_len = 0;

	/* add the header size */
	resp_len += sizeof (dr_cpu_hdr_t);

	/* add the stat array size */
	stat_len = sizeof (dr_cpu_stat_t) * nstat;
	resp_len += stat_len;

	/* add the size of any error strings */
	for (idx = 0; idx < nstat; idx++) {
		if (res[idx].string != NULL) {
			resp_len += strlen(res[idx].string) + 1;
		}
	}

	/* allocate the message buffer */
	resp = kmem_zalloc(resp_len, KM_SLEEP);
	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
	    __func__, (void *)resp, resp_len);

	/*
	 * Fill in the header information.
	 */
	resp->req_num = req->req_num;
	resp->msg_type = DR_CPU_OK;
	resp->num_records = nstat;

	/*
	 * Fill in the stat information.
	 */
	resp_stat = DR_CPU_RESP_STATS(resp);

	/* string offsets start immediately after stat array */
	curr_off = sizeof (dr_cpu_hdr_t) + stat_len;
	curr_str = (char *)resp_stat + stat_len;

	for (idx = 0; idx < nstat; idx++) {
		resp_stat[idx].cpuid = res[idx].cpuid;
		resp_stat[idx].result = res[idx].result;
		resp_stat[idx].status = res[idx].status;

		if (res[idx].string != NULL) {
			/* copy over the error string */
			str_len = strlen(res[idx].string) + 1;
			bcopy(res[idx].string, curr_str, str_len);
			resp_stat[idx].string_off = curr_off;

			curr_off += str_len;
			curr_str += str_len;
		}
	}

	/* buffer should be exactly filled */
	ASSERT(curr_off == resp_len);

	*respp = resp;
	return (resp_len);
}

/*
 * Check for conditions that will prevent a CPU from being offlined.
 * This provides the opportunity to generate useful information to
 * help diagnose the failure rather than letting the offline attempt
 * fail in a more generic way.
 */
static void
dr_cpu_check_cpus(dr_cpu_hdr_t *req, dr_cpu_res_t *res)
{
	int		idx;
	cpu_t		*cp;
	uint32_t	*cpuids;

	ASSERT((req->msg_type == DR_CPU_UNCONFIGURE) ||
	    (req->msg_type == DR_CPU_FORCE_UNCONFIG));

	DR_DBG_CPU("dr_cpu_check_cpus...\n");

	/* array of cpuids start just after the header */
	cpuids = DR_CPU_CMD_CPUIDS(req);

	mutex_enter(&cpu_lock);

	/*
	 * Always check processor set membership first. The
	 * last CPU in a processor set will fail to offline
	 * even if the operation is forced, so any failures
	 * should always be reported.
	 */
	dr_cpu_check_psrset(cpuids, res, req->num_records);

	/* process each cpu that is part of the request */
	for (idx = 0; idx < req->num_records; idx++) {

		/* nothing to check if the CPU has already failed */
		if (res[idx].result != DR_CPU_RES_OK)
			continue;

		if ((cp = cpu_get(cpuids[idx])) == NULL)
			continue;

		/*
		 * Only check if there are bound threads if the
		 * operation is not a forced unconfigure. In a
		 * forced request, threads are automatically
		 * unbound before they are offlined.
		 */
		if (req->msg_type == DR_CPU_UNCONFIGURE) {
			/*
			 * The return value is only interesting if other
			 * checks are added to this loop and a decision
			 * is needed on whether to continue checking.
			 */
			(void) dr_cpu_check_bound_thr(cp, &res[idx]);
		}
	}

	mutex_exit(&cpu_lock);
}

/*
 * Examine the processor set configuration for the specified
 * CPUs and see if the unconfigure operation would result in
 * trying to remove the last CPU in any processor set.
 */
static void
dr_cpu_check_psrset(uint32_t *cpuids, dr_cpu_res_t *res, int nres)
{
	int		cpu_idx;
	int		set_idx;
	cpu_t		*cp;
	cpupart_t	*cpp;
	char		err_str[DR_CPU_MAX_ERR_LEN];
	size_t		err_len;
	struct {
		cpupart_t	*cpp;
		int		ncpus;
	} *psrset;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Allocate a scratch array to count the CPUs in
	 * the various processor sets. A CPU always belongs
	 * to exactly one processor set, so by definition,
	 * the scratch array never needs to be larger than
	 * the number of CPUs.
	 */
	psrset = kmem_zalloc(sizeof (*psrset) * nres, KM_SLEEP);
	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
	    __func__, (void *)psrset, sizeof (*psrset) * nres);

	for (cpu_idx = 0; cpu_idx < nres; cpu_idx++) {

		/* skip any CPUs that have already failed */
		if (res[cpu_idx].result != DR_CPU_RES_OK)
			continue;

		if ((cp = cpu_get(cpuids[cpu_idx])) == NULL)
			continue;

		cpp = cp->cpu_part;

		/* lookup the set this CPU belongs to */
		for (set_idx = 0; set_idx < nres; set_idx++) {

			/* matching set found */
			if (cpp == psrset[set_idx].cpp)
				break;

			/* set not found, start a new entry */
			if (psrset[set_idx].cpp == NULL) {
				psrset[set_idx].cpp = cpp;
				psrset[set_idx].ncpus = cpp->cp_ncpus;
				break;
			}
		}

		ASSERT(set_idx != nres);

		/*
		 * Remove the current CPU from the set total but only
		 * generate an error for the last CPU. The correct CPU
		 * will get the error because the unconfigure attempts
		 * will occur in the same order in which the CPUs are
		 * examined in this loop.  The cp_ncpus field of a
		 * cpupart_t counts only online cpus, so it is safe
		 * to remove an offline cpu without testing ncpus.
		 */
		if (cpu_is_offline(cp))
			continue;

		if (--psrset[set_idx].ncpus == 0) {
			/*
			 * Fill in the various pieces of information
			 * to report that the operation will fail.
			 */
			res[cpu_idx].result = DR_CPU_RES_BLOCKED;
			res[cpu_idx].status = DR_CPU_STAT_CONFIGURED;

			(void) snprintf(err_str, DR_CPU_MAX_ERR_LEN,
			    "last online cpu in processor set %d", cpp->cp_id);

			err_len = strlen(err_str) + 1;

			res[cpu_idx].string = kmem_alloc(err_len, KM_SLEEP);
			DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
			    __func__, (void *)(res[cpu_idx].string), err_len);
			bcopy(err_str, res[cpu_idx].string, err_len);

			DR_DBG_CPU("cpu %d: %s\n", cpuids[cpu_idx], err_str);
		}
	}

	DR_DBG_KMEM("%s: free addr %p size %ld\n",
	    __func__, (void *)psrset, sizeof (*psrset) * nres);
	kmem_free(psrset, sizeof (*psrset) * nres);
}

/*
 * Check if any threads are bound to the specified CPU. If the
 * condition is true, DR_CPU_RES_BLOCKED is returned and an error
 * string is generated and placed in the specified result structure.
 * Otherwise, DR_CPU_RES_OK is returned.
 */
static int
dr_cpu_check_bound_thr(cpu_t *cp, dr_cpu_res_t *res)
{
	int		nbound;
	proc_t		*pp;
	kthread_t	*tp;
	char		err_str[DR_CPU_MAX_ERR_LEN];
	size_t		err_len;

	/*
	 * Error string allocation makes an assumption
	 * that no blocking condition has been identified.
	 */
	ASSERT(res->result == DR_CPU_RES_OK);
	ASSERT(res->string == NULL);

	ASSERT(MUTEX_HELD(&cpu_lock));

	mutex_enter(&pidlock);

	nbound = 0;

	/*
	 * Walk the active processes, checking if each
	 * thread belonging to the process is bound.
	 */
	for (pp = practive; (pp != NULL) && (nbound <= 1); pp = pp->p_next) {
		mutex_enter(&pp->p_lock);

		tp = pp->p_tlist;

		if ((tp == NULL) || (pp->p_flag & SSYS)) {
			mutex_exit(&pp->p_lock);
			continue;
		}

		do {
			if (tp->t_bind_cpu != cp->cpu_id)
				continue;

			/*
			 * Update the running total of bound
			 * threads. Continue the search until
			 * it can be determined if more than
			 * one thread is bound to the CPU.
			 */
			if (++nbound > 1)
				break;

		} while ((tp = tp->t_forw) != pp->p_tlist);

		mutex_exit(&pp->p_lock);
	}

	mutex_exit(&pidlock);

	if (nbound) {
		/*
		 * Threads are bound to the CPU. Fill in
		 * various pieces of information to report
		 * that the operation will fail.
		 */
		res->result = DR_CPU_RES_BLOCKED;
		res->status = DR_CPU_STAT_CONFIGURED;

		(void) snprintf(err_str, DR_CPU_MAX_ERR_LEN, "cpu has bound "
		    "thread%s", (nbound > 1) ? "s" : "");

		err_len = strlen(err_str) + 1;

		res->string = kmem_alloc(err_len, KM_SLEEP);
		DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
		    __func__, (void *)(res->string), err_len);
		bcopy(err_str, res->string, err_len);

		DR_DBG_CPU("cpu %d: %s\n", cp->cpu_id, err_str);
	}

	return (res->result);
}

/*
 * Do not modify result buffer or length on error.
 */
static int
dr_cpu_list_status(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
{
	int		idx;
	int		result;
	int		status;
	int		rlen;
	uint32_t	*cpuids;
	dr_cpu_hdr_t	*rp;
	dr_cpu_stat_t	*stat;
	md_t		*mdp = NULL;
	int		num_nodes;
	int		listsz;
	mde_cookie_t	*listp = NULL;
	mde_cookie_t	cpunode;
	boolean_t	walk_md = B_FALSE;

	/* the incoming array of cpuids to configure */
	cpuids = DR_CPU_CMD_CPUIDS(req);

	/* allocate a response message */
	rlen = sizeof (dr_cpu_hdr_t);
	rlen += req->num_records * sizeof (dr_cpu_stat_t);
	rp = kmem_zalloc(rlen, KM_SLEEP);
	DR_DBG_KMEM("%s: alloc addr %p size %d\n", __func__, (void *)rp, rlen);

	/* fill in the known data */
	rp->req_num = req->req_num;
	rp->msg_type = DR_CPU_STATUS;
	rp->num_records = req->num_records;

	/* stat array for the response */
	stat = DR_CPU_RESP_STATS(rp);

	/* get the status for each of the CPUs */
	for (idx = 0; idx < req->num_records; idx++) {

		result = dr_cpu_status(cpuids[idx], &status);

		if (result == DR_CPU_RES_FAILURE)
			walk_md = B_TRUE;

		/* save off results of the status */
		stat[idx].cpuid = cpuids[idx];
		stat[idx].result = result;
		stat[idx].status = status;
	}

	if (walk_md == B_FALSE)
		goto done;

	/*
	 * At least one of the cpus did not have a CPU
	 * structure. So, consult the MD to determine if
	 * they are present.
	 */

	if ((mdp = md_get_handle()) == NULL) {
		DR_DBG_CPU("unable to initialize MD\n");
		goto done;
	}

	num_nodes = md_node_count(mdp);
	ASSERT(num_nodes > 0);

	listsz = num_nodes * sizeof (mde_cookie_t);
	listp = kmem_zalloc(listsz, KM_SLEEP);
	DR_DBG_KMEM("%s: alloc addr %p size %d\n",
	    __func__, (void *)listp, listsz);

	for (idx = 0; idx < req->num_records; idx++) {

		if (stat[idx].result != DR_CPU_RES_FAILURE)
			continue;

		/* check the MD for the current cpuid */
		cpunode = dr_cpu_find_node_md(stat[idx].cpuid, mdp, listp);

		stat[idx].result = DR_CPU_RES_OK;

		if (cpunode == MDE_INVAL_ELEM_COOKIE) {
			stat[idx].status = DR_CPU_STAT_NOT_PRESENT;
		} else {
			stat[idx].status = DR_CPU_STAT_UNCONFIGURED;
		}
	}

	DR_DBG_KMEM("%s: free addr %p size %d\n",
	    __func__, (void *)listp, listsz);
	kmem_free(listp, listsz);

	(void) md_fini_handle(mdp);

done:
	*resp = rp;
	*resp_len = rlen;

	return (0);
}

static int
dr_cpu_configure(processorid_t cpuid, int *status, boolean_t force)
{
	_NOTE(ARGUNUSED(force))
	struct cpu	*cp;
	int		rv = 0;

	DR_DBG_CPU("dr_cpu_configure...\n");

	/*
	 * Build device tree node for the CPU
	 */
	if ((rv = dr_cpu_probe(cpuid)) != 0) {
		DR_DBG_CPU("failed to probe CPU %d (%d)\n", cpuid, rv);
		if (rv == EINVAL) {
			*status = DR_CPU_STAT_NOT_PRESENT;
			return (DR_CPU_RES_NOT_IN_MD);
		}
		*status = DR_CPU_STAT_UNCONFIGURED;
		return (DR_CPU_RES_FAILURE);
	}

	mutex_enter(&cpu_lock);

	/*
	 * Configure the CPU
	 */
	if ((cp = cpu_get(cpuid)) == NULL) {

		if ((rv = cpu_configure(cpuid)) != 0) {
			DR_DBG_CPU("failed to configure CPU %d (%d)\n",
			    cpuid, rv);
			rv = DR_CPU_RES_FAILURE;
			*status = DR_CPU_STAT_UNCONFIGURED;
			goto done;
		}

		DR_DBG_CPU("CPU %d configured\n", cpuid);

		/* CPU struct should exist now */
		cp = cpu_get(cpuid);
	}

	ASSERT(cp);

	/*
	 * Power on the CPU. In sun4v, this brings the stopped
	 * CPU into the guest from the Hypervisor.
	 */
	if (cpu_is_poweredoff(cp)) {

		if ((rv = cpu_poweron(cp)) != 0) {
			DR_DBG_CPU("failed to power on CPU %d (%d)\n",
			    cpuid, rv);
			rv = DR_CPU_RES_FAILURE;
			*status = DR_CPU_STAT_UNCONFIGURED;
			goto done;
		}

		DR_DBG_CPU("CPU %d powered on\n", cpuid);
	}

	/*
	 * Online the CPU
	 */
	if (cpu_is_offline(cp)) {

		if ((rv = cpu_online(cp)) != 0) {
			DR_DBG_CPU("failed to online CPU %d (%d)\n",
			    cpuid, rv);
			rv = DR_CPU_RES_FAILURE;
			/* offline is still configured */
			*status = DR_CPU_STAT_CONFIGURED;
			goto done;
		}

		DR_DBG_CPU("CPU %d online\n", cpuid);
	}

	rv = DR_CPU_RES_OK;
	*status = DR_CPU_STAT_CONFIGURED;

done:
	mutex_exit(&cpu_lock);

	return (rv);
}

static int
dr_cpu_unconfigure(processorid_t cpuid, int *status, boolean_t force)
{
	struct cpu	*cp;
	int		rv = 0;
	int		cpu_flags;

	DR_DBG_CPU("dr_cpu_unconfigure%s...\n", (force) ? " (force)" : "");

	mutex_enter(&cpu_lock);

	cp = cpu_get(cpuid);

	if (cp == NULL) {
		/*
		 * The OS CPU structures are already torn down,
		 * so proceed to deprobe the device tree to make
		 * sure it is up to date.
		 */
		goto deprobe;
	}

	ASSERT(cp->cpu_id == cpuid);

	/*
	 * Offline the CPU
	 */
	if (cpu_is_active(cp)) {

		/* set the force flag correctly */
		cpu_flags = (force) ? CPU_FORCED : 0;

		/*
		 * Before we take the CPU offline, we first enable interrupts.
		 * Otherwise, cpu_offline() might reject the request.  Note:
		 * if the offline subsequently fails, the target cpu will be
		 * left with interrupts enabled.  This is consistent with the
		 * behavior of psradm(1M) and p_online(2).
		 */
		cpu_intr_enable(cp);

		if ((rv = cpu_offline(cp, cpu_flags)) != 0) {
			DR_DBG_CPU("failed to offline CPU %d (%d)\n",
			    cpuid, rv);

			rv = DR_CPU_RES_FAILURE;
			*status = DR_CPU_STAT_CONFIGURED;
			mutex_exit(&cpu_lock);
			return (rv);
		}

		DR_DBG_CPU("CPU %d offline\n", cpuid);
	}

	/*
	 * Power off the CPU. In sun4v, this puts the running
	 * CPU into the stopped state in the Hypervisor.
	 */
	if (!cpu_is_poweredoff(cp)) {

		if ((rv = cpu_poweroff(cp)) != 0) {
			DR_DBG_CPU("failed to power off CPU %d (%d)\n",
			    cpuid, rv);
			rv = DR_CPU_RES_FAILURE;
			*status = DR_CPU_STAT_CONFIGURED;
			mutex_exit(&cpu_lock);
			return (rv);
		}

		DR_DBG_CPU("CPU %d powered off\n", cpuid);
	}

	/*
	 * Unconfigure the CPU
	 */
	if ((rv = cpu_unconfigure(cpuid)) != 0) {
		DR_DBG_CPU("failed to unconfigure CPU %d (%d)\n", cpuid, rv);
		rv = DR_CPU_RES_FAILURE;
		*status = DR_CPU_STAT_UNCONFIGURED;
		mutex_exit(&cpu_lock);
		return (rv);
	}

	DR_DBG_CPU("CPU %d unconfigured\n", cpuid);

deprobe:
	mutex_exit(&cpu_lock);
	/*
	 * Tear down device tree.
	 */
	if ((rv = dr_cpu_deprobe(cpuid)) != 0) {
		DR_DBG_CPU("failed to deprobe CPU %d (%d)\n", cpuid, rv);
		rv = DR_CPU_RES_FAILURE;
		*status = DR_CPU_STAT_UNCONFIGURED;
		return (rv);
	}

	rv = DR_CPU_RES_OK;
	*status = DR_CPU_STAT_UNCONFIGURED;

	return (rv);
}

/*
 * Determine the state of a CPU. If the CPU structure is not present,
 * it does not attempt to determine whether or not the CPU is in the
 * MD. It is more efficient to do this at the higher level for all
 * CPUs since it may not even be necessary to search the MD if all
 * the CPUs are accounted for. Returns DR_CPU_RES_OK if the CPU
 * structure is present, and DR_CPU_RES_FAILURE otherwise as a signal
 * that an MD walk is necessary.
 */
static int
dr_cpu_status(processorid_t cpuid, int *status)
{
	int		rv;
	struct cpu	*cp;

	DR_DBG_CPU("dr_cpu_status...\n");

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(cpuid)) == NULL) {
		/* need to check if cpu is in the MD */
		rv = DR_CPU_RES_FAILURE;
		goto done;
	}

	if (cpu_is_poweredoff(cp)) {
		/*
		 * The CPU is powered off, so it is considered
		 * unconfigured from the service entity point of
		 * view. The CPU is not available to the system
		 * and intervention by the service entity would
		 * be required to change that.
		 */
		*status = DR_CPU_STAT_UNCONFIGURED;
	} else {
		/*
		 * The CPU is powered on, so it is considered
		 * configured from the service entity point of
		 * view. It is available for use by the system
		 * and service entities are not concerned about
		 * the operational status (offline, online, etc.)
		 * of the CPU in terms of DR.
		 */
		*status = DR_CPU_STAT_CONFIGURED;
	}

	rv = DR_CPU_RES_OK;

done:
	mutex_exit(&cpu_lock);

	return (rv);
}

typedef struct {
	md_t		*mdp;
	mde_cookie_t	cpunode;
	dev_info_t	*dip;
} cb_arg_t;

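/*
 * Upper bound on the number of 'compatible' strings copied from
 * the MD when constructing a new CPU node below.
 */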
#define	STR_ARR_LEN	5

static int
new_cpu_node(dev_info_t *new_node, void *arg, uint_t flags)
{
	_NOTE(ARGUNUSED(flags))

	char		*compat;
	uint64_t	freq;
	uint64_t	cpuid = 0;
	int		regbuf[4];
	int		len = 0;
	cb_arg_t	*cba;
	char		*str_arr[STR_ARR_LEN];
	char		*curr;
	int		idx = 0;

	DR_DBG_CPU("new_cpu_node...\n");

	cba = (cb_arg_t *)arg;

	/*
	 * Add 'name' property
	 */
	if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node,
	    "name", "cpu") != DDI_SUCCESS) {
		DR_DBG_CPU("new_cpu_node: failed to create 'name' property\n");
		return (DDI_WALK_ERROR);
	}

	/*
	 * Add 'compatible' property
	 */
	if (md_get_prop_data(cba->mdp, cba->cpunode, "compatible",
	    (uint8_t **)(&compat), &len)) {
		DR_DBG_CPU("new_cpu_node: failed to read 'compatible' property "
		    "from MD\n");
		return (DDI_WALK_ERROR);
	}

	DR_DBG_CPU("'compatible' len is %d\n", len);

	/* parse the MD string array */
	curr = compat;
	while (curr < (compat + len)) {

		DR_DBG_CPU("adding '%s' to 'compatible' property\n", curr);

		str_arr[idx++] = curr;
		curr += strlen(curr) + 1;

		if (idx == STR_ARR_LEN) {
			DR_DBG_CPU("exceeded str_arr len (%d)\n", STR_ARR_LEN);
			break;
		}
	}

	if (ndi_prop_update_string_array(DDI_DEV_T_NONE, new_node,
	    "compatible", str_arr, idx) != DDI_SUCCESS) {
		DR_DBG_CPU("new_cpu_node: failed to create 'compatible' "
		    "property\n");
		return (DDI_WALK_ERROR);
	}

	/*
	 * Add 'device_type' property
	 */
	if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node,
	    "device_type", "cpu") != DDI_SUCCESS) {
		DR_DBG_CPU("new_cpu_node: failed to create 'device_type' "
		    "property\n");
		return (DDI_WALK_ERROR);
	}

	/*
	 * Add 'clock-frequency' property
	 */
	if (md_get_prop_val(cba->mdp, cba->cpunode, "clock-frequency", &freq)) {
		DR_DBG_CPU("new_cpu_node: failed to read 'clock-frequency' "
		    "property from MD\n");
		return (DDI_WALK_ERROR);
	}

	if (ndi_prop_update_int(DDI_DEV_T_NONE, new_node,
	    "clock-frequency", freq) != DDI_SUCCESS) {
		DR_DBG_CPU("new_cpu_node: failed to create 'clock-frequency' "
		    "property\n");
		return (DDI_WALK_ERROR);
	}

	/*
	 * Add 'reg' (cpuid) property
	 */
	if (md_get_prop_val(cba->mdp, cba->cpunode, "id", &cpuid)) {
		DR_DBG_CPU("new_cpu_node: failed to read 'id' property "
		    "from MD\n");
		return (DDI_WALK_ERROR);
	}

	DR_DBG_CPU("new cpuid=0x%lx\n", cpuid);

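	/*
	 * Encode the cpuid into the low bits of a config handle
	 * for the 'reg' property; dr_cpu_check_node() recovers it
	 * with PROM_CFGHDL_TO_CPUID().
	 */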
	bzero(regbuf, 4 * sizeof (int));
	regbuf[0] = 0xc0000000 | cpuid;

	if (ndi_prop_update_int_array(DDI_DEV_T_NONE, new_node,
	    "reg", regbuf, 4) != DDI_SUCCESS) {
		DR_DBG_CPU("new_cpu_node: failed to create 'reg' property\n");
		return (DDI_WALK_ERROR);
	}

	cba->dip = new_node;

	return (DDI_WALK_TERMINATE);
}

static int
dr_cpu_probe(processorid_t cpuid)
{
	dev_info_t	*pdip;
	dev_info_t	*dip;
	devi_branch_t	br;
	md_t		*mdp = NULL;
	int		num_nodes;
	int		rv = 0;
	int		listsz;
	mde_cookie_t	*listp = NULL;
	cb_arg_t	cba;
	mde_cookie_t	cpunode;

	if ((dip = dr_cpu_find_node(cpuid)) != NULL) {
		/* nothing to do */
		e_ddi_branch_rele(dip);
		return (0);
	}

	if ((mdp = md_get_handle()) == NULL) {
		DR_DBG_CPU("unable to initialize machine description\n");
		return (-1);
	}

	num_nodes = md_node_count(mdp);
	ASSERT(num_nodes > 0);

	listsz = num_nodes * sizeof (mde_cookie_t);
	listp = kmem_zalloc(listsz, KM_SLEEP);
	DR_DBG_KMEM("%s: alloc addr %p size %d\n",
	    __func__, (void *)listp, listsz);

	cpunode = dr_cpu_find_node_md(cpuid, mdp, listp);

	if (cpunode == MDE_INVAL_ELEM_COOKIE) {
		rv = EINVAL;
		goto done;
	}

	/* pass in MD cookie for CPU */
	cba.mdp = mdp;
	cba.cpunode = cpunode;

	br.arg = (void *)&cba;
	br.type = DEVI_BRANCH_SID;
	br.create.sid_branch_create = new_cpu_node;
	br.devi_branch_callback = NULL;
	pdip = ddi_root_node();

	if ((rv = e_ddi_branch_create(pdip, &br, NULL, 0))) {
		DR_DBG_CPU("e_ddi_branch_create failed: %d\n", rv);
		rv = -1;
		goto done;
	}

	DR_DBG_CPU("CPU %d probed\n", cpuid);

	rv = 0;

done:
	if (listp) {
		DR_DBG_KMEM("%s: free addr %p size %d\n",
		    __func__, (void *)listp, listsz);
		kmem_free(listp, listsz);
	}

	if (mdp)
		(void) md_fini_handle(mdp);

	return (rv);
}

static int
dr_cpu_deprobe(processorid_t cpuid)
{
	dev_info_t	*fdip = NULL;
	dev_info_t	*dip;

	if ((dip = dr_cpu_find_node(cpuid)) == NULL) {
		DR_DBG_CPU("cpuid %d already deprobed\n", cpuid);
		return (0);
	}

	ASSERT(e_ddi_branch_held(dip));

	if (e_ddi_branch_destroy(dip, &fdip, 0)) {
		char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		DR_DBG_KMEM("%s: alloc addr %p size %d\n",
		    __func__, (void *)path, MAXPATHLEN);
		/*
		 * If non-NULL, fdip is held and must be released.
		 */
		if (fdip != NULL) {
			(void) ddi_pathname(fdip, path);
			ddi_release_devi(fdip);
		} else {
			(void) ddi_pathname(dip, path);
		}
		cmn_err(CE_NOTE, "node removal failed: %s (%p)",
		    path, (fdip) ? (void *)fdip : (void *)dip);

		DR_DBG_KMEM("%s: free addr %p size %d\n",
		    __func__, (void *)path, MAXPATHLEN);
		kmem_free(path, MAXPATHLEN);

		return (-1);
	}

	DR_DBG_CPU("CPU %d deprobed\n", cpuid);

	return (0);
}

typedef struct {
	processorid_t	cpuid;
	dev_info_t	*dip;
} dr_search_arg_t;

static int
dr_cpu_check_node(dev_info_t *dip, void *arg)
{
	char		*name;
	processorid_t	cpuid;
	dr_search_arg_t	*sarg = (dr_search_arg_t *)arg;

	if (dip == ddi_root_node()) {
		return (DDI_WALK_CONTINUE);
	}

	name = ddi_node_name(dip);

	if (strcmp(name, "cpu") != 0) {
		return (DDI_WALK_PRUNECHILD);
	}

	cpuid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "reg", -1);

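	/* extract the cpuid from the config handle set up in new_cpu_node() */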
	cpuid = PROM_CFGHDL_TO_CPUID(cpuid);

	DR_DBG_CPU("found cpuid=0x%x, looking for 0x%x\n", cpuid, sarg->cpuid);

	if (cpuid == sarg->cpuid) {
		DR_DBG_CPU("matching node\n");

		/* matching node must be returned held */
		if (!e_ddi_branch_held(dip))
			e_ddi_branch_hold(dip);

		sarg->dip = dip;
		return (DDI_WALK_TERMINATE);
	}

	return (DDI_WALK_CONTINUE);
}

/*
 * Walk the device tree to find the dip corresponding to the cpuid
 * passed in. If present, the dip is returned held. The caller must
 * release the hold on the dip once it is no longer required. If no
 * matching node is found, NULL is returned.
 */
static dev_info_t *
dr_cpu_find_node(processorid_t cpuid)
{
	dr_search_arg_t	arg;

	DR_DBG_CPU("dr_cpu_find_node...\n");

	arg.cpuid = cpuid;
	arg.dip = NULL;

	ddi_walk_devs(ddi_root_node(), dr_cpu_check_node, &arg);

	ASSERT((arg.dip == NULL) || (e_ddi_branch_held(arg.dip)));

	return ((arg.dip) ? arg.dip : NULL);
}

/*
 * Look up a particular cpuid in the MD. Returns the mde_cookie_t
 * representing that CPU if present, and MDE_INVAL_ELEM_COOKIE
 * otherwise. It is assumed the scratch array has already been
 * allocated so that it can accommodate the worst case scenario,
 * every node in the MD.
 */
static mde_cookie_t
dr_cpu_find_node_md(processorid_t cpuid, md_t *mdp, mde_cookie_t *listp)
{
	int		idx;
	int		nnodes;
	mde_cookie_t	rootnode;
	uint64_t	cpuid_prop;
	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;

	rootnode = md_root_node(mdp);
	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);

	/*
	 * Scan the DAG for all the CPU nodes
	 */
	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "cpu"),
	    md_find_name(mdp, "fwd"), listp);

	if (nnodes < 0) {
		DR_DBG_CPU("Scan for CPUs failed\n");
		return (result);
	}

	DR_DBG_CPU("dr_cpu_find_node_md: found %d CPUs in the MD\n", nnodes);

	/*
	 * Find the CPU of interest
	 */
	for (idx = 0; idx < nnodes; idx++) {

		if (md_get_prop_val(mdp, listp[idx], "id", &cpuid_prop)) {
			DR_DBG_CPU("Missing 'id' property for CPU node %d\n",
			    idx);
			break;
		}

		if (cpuid_prop == cpuid) {
			/* found a match */
			DR_DBG_CPU("dr_cpu_find_node_md: found CPU %d "
			    "in MD\n", cpuid);
			result = listp[idx];
			break;
		}
	}

	if (result == MDE_INVAL_ELEM_COOKIE) {
		DR_DBG_CPU("CPU %d not in MD\n", cpuid);
	}

	return (result);
}