1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25/*
26 * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
27 * detailed discussion of the overall mpxio architecture.
28 *
29 * Default locking order:
30 *
31 * _NOTE(LOCK_ORDER(mdi_mutex, mdi_vhci:vh_phci_mutex);
32 * _NOTE(LOCK_ORDER(mdi_mutex, mdi_vhci:vh_client_mutex);
33 * _NOTE(LOCK_ORDER(mdi_vhci:vh_phci_mutex, mdi_phci::ph_mutex);
34 * _NOTE(LOCK_ORDER(mdi_vhci:vh_client_mutex, mdi_client::ct_mutex);
35 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
36 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
37 * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
38 */
39
40#include <sys/note.h>
41#include <sys/types.h>
42#include <sys/varargs.h>
43#include <sys/param.h>
44#include <sys/errno.h>
45#include <sys/uio.h>
46#include <sys/buf.h>
47#include <sys/modctl.h>
48#include <sys/open.h>
49#include <sys/kmem.h>
50#include <sys/poll.h>
51#include <sys/conf.h>
52#include <sys/bootconf.h>
53#include <sys/cmn_err.h>
54#include <sys/stat.h>
55#include <sys/ddi.h>
56#include <sys/sunddi.h>
57#include <sys/ddipropdefs.h>
58#include <sys/sunndi.h>
59#include <sys/ndi_impldefs.h>
60#include <sys/promif.h>
61#include <sys/sunmdi.h>
62#include <sys/mdi_impldefs.h>
63#include <sys/taskq.h>
64#include <sys/epm.h>
65#include <sys/sunpm.h>
66#include <sys/modhash.h>
67#include <sys/disp.h>
68#include <sys/autoconf.h>
69#include <sys/sysmacros.h>
70
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * Emit a debug message via i_mdi_log() when the global debug level is at
 * or above 'dbglevel'.  The expansion is wrapped in do/while(0) so the
 * macro is a single statement and is safe inside unbraced if/else bodies
 * (the bare "if" form risks the classic dangling-else bug).
 */
#define	MDI_DEBUG(dbglevel, pargs)	do {				\
	if (mdi_debug >= (dbglevel))					\
		i_mdi_log pargs;					\
} while (0)
#define	MDI_WARN	CE_WARN, __func__
#define	MDI_NOTE	CE_NOTE, __func__
#define	MDI_CONT	CE_CONT, __func__
static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
#else	/* !DEBUG */
#define	MDI_DEBUG(dbglevel, pargs)	do { } while (0)
#endif	/* DEBUG */
83int	mdi_debug_consoleonly = 0;
84int	mdi_delay = 3;
85
86extern pri_t	minclsyspri;
87extern int	modrootloaded;
88
89/*
90 * Global mutex:
91 * Protects vHCI list and structure members.
92 */
93kmutex_t	mdi_mutex;
94
95/*
96 * Registered vHCI class driver lists
97 */
98int		mdi_vhci_count;
99mdi_vhci_t	*mdi_vhci_head;
100mdi_vhci_t	*mdi_vhci_tail;
101
102/*
103 * Client Hash Table size
104 */
105static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
106
107/*
108 * taskq interface definitions
109 */
110#define	MDI_TASKQ_N_THREADS	8
111#define	MDI_TASKQ_PRI		minclsyspri
112#define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
113#define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
114
115taskq_t				*mdi_taskq;
116static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
117
118#define	TICKS_PER_SECOND	(drv_usectohz(1000000))
119
120/*
121 * The data should be "quiet" for this interval (in seconds) before the
122 * vhci cached data is flushed to the disk.
123 */
124static int mdi_vhcache_flush_delay = 10;
125
126/* number of seconds the vhcache flush daemon will sleep idle before exiting */
127static int mdi_vhcache_flush_daemon_idle_time = 60;
128
129/*
130 * MDI falls back to discovery of all paths when a bus_config_one fails.
131 * The following parameters can be used to tune this operation.
132 *
133 * mdi_path_discovery_boot
134 *	Number of times path discovery will be attempted during early boot.
135 *	Probably there is no reason to ever set this value to greater than one.
136 *
137 * mdi_path_discovery_postboot
138 *	Number of times path discovery will be attempted after early boot.
139 *	Set it to a minimum of two to allow for discovery of iscsi paths which
140 *	may happen very late during booting.
141 *
142 * mdi_path_discovery_interval
143 *	Minimum number of seconds MDI will wait between successive discovery
144 *	of all paths. Set it to -1 to disable discovery of all paths.
145 */
146static int mdi_path_discovery_boot = 1;
147static int mdi_path_discovery_postboot = 2;
148static int mdi_path_discovery_interval = 10;
149
150/*
151 * number of seconds the asynchronous configuration thread will sleep idle
152 * before exiting.
153 */
154static int mdi_async_config_idle_time = 600;
155
156static int mdi_bus_config_cache_hash_size = 256;
157
158/* turns off multithreaded configuration for certain operations */
159static int mdi_mtc_off = 0;
160
161/*
162 * The "path" to a pathinfo node is identical to the /devices path to a
163 * devinfo node had the device been enumerated under a pHCI instead of
164 * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
165 * This association persists across create/delete of the pathinfo nodes,
166 * but not across reboot.
167 */
168static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
169static int		mdi_pathmap_hash_size = 256;
170static kmutex_t		mdi_pathmap_mutex;
171static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
172static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
173static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */
174
175/*
176 * MDI component property name/value string definitions
177 */
178const char 		*mdi_component_prop = "mpxio-component";
179const char		*mdi_component_prop_vhci = "vhci";
180const char		*mdi_component_prop_phci = "phci";
181const char		*mdi_component_prop_client = "client";
182
183/*
184 * MDI client global unique identifier property name
185 */
186const char		*mdi_client_guid_prop = "client-guid";
187
188/*
189 * MDI client load balancing property name/value string definitions
190 */
191const char		*mdi_load_balance = "load-balance";
192const char		*mdi_load_balance_none = "none";
193const char		*mdi_load_balance_rr = "round-robin";
194const char		*mdi_load_balance_lba = "logical-block";
195
196/*
197 * Obsolete vHCI class definition; to be removed after Leadville update
198 */
199const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
200
201static char vhci_greeting[] =
202	"\tThere already exists one vHCI driver for class %s\n"
203	"\tOnly one vHCI driver for each class is allowed\n";
204
205/*
206 * Static function prototypes
207 */
208static int		i_mdi_phci_offline(dev_info_t *, uint_t);
209static int		i_mdi_client_offline(dev_info_t *, uint_t);
210static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
211static void		i_mdi_phci_post_detach(dev_info_t *,
212			    ddi_detach_cmd_t, int);
213static int		i_mdi_client_pre_detach(dev_info_t *,
214			    ddi_detach_cmd_t);
215static void		i_mdi_client_post_detach(dev_info_t *,
216			    ddi_detach_cmd_t, int);
217static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
218static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
219static int 		i_mdi_lba_lb(mdi_client_t *ct,
220			    mdi_pathinfo_t **ret_pip, struct buf *buf);
221static void		i_mdi_pm_hold_client(mdi_client_t *, int);
222static void		i_mdi_pm_rele_client(mdi_client_t *, int);
223static void		i_mdi_pm_reset_client(mdi_client_t *);
224static int		i_mdi_power_all_phci(mdi_client_t *);
225static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
226
227
228/*
229 * Internal mdi_pathinfo node functions
230 */
231static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
232
233static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
234static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
235static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
236static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
237static void		i_mdi_phci_unlock(mdi_phci_t *);
238static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
239static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
240static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
241static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
242			    mdi_client_t *);
243static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
244static void		i_mdi_client_remove_path(mdi_client_t *,
245			    mdi_pathinfo_t *);
246
247static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
248			    mdi_pathinfo_state_t, int);
249static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
250static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
251			    char **, int);
252static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
253static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
254static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
255static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
256static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
257static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
258static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
259static void		i_mdi_client_update_state(mdi_client_t *);
260static int		i_mdi_client_compute_state(mdi_client_t *,
261			    mdi_phci_t *);
262static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
263static void		i_mdi_client_unlock(mdi_client_t *);
264static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
265static mdi_client_t	*i_devi_get_client(dev_info_t *);
266/*
267 * NOTE: this will be removed once the NWS files are changed to use the new
268 * mdi_{enable,disable}_path interfaces
269 */
270static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
271				int, int);
272static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
273				mdi_vhci_t *vh, int flags, int op);
274/*
275 * Failover related function prototypes
276 */
277static int		i_mdi_failover(void *);
278
279/*
280 * misc internal functions
281 */
282static int		i_mdi_get_hash_key(char *);
283static int		i_map_nvlist_error_to_mdi(int);
284static void		i_mdi_report_path_state(mdi_client_t *,
285			    mdi_pathinfo_t *);
286
287static void		setup_vhci_cache(mdi_vhci_t *);
288static int		destroy_vhci_cache(mdi_vhci_t *);
289static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
290static boolean_t	stop_vhcache_flush_thread(void *, int);
291static void		free_string_array(char **, int);
292static void		free_vhcache_phci(mdi_vhcache_phci_t *);
293static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
294static void		free_vhcache_client(mdi_vhcache_client_t *);
295static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
296static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
297static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
298static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
299static void		vhcache_pi_add(mdi_vhci_config_t *,
300			    struct mdi_pathinfo *);
301static void		vhcache_pi_remove(mdi_vhci_config_t *,
302			    struct mdi_pathinfo *);
303static void		free_phclient_path_list(mdi_phys_path_t *);
304static void		sort_vhcache_paths(mdi_vhcache_client_t *);
305static int		flush_vhcache(mdi_vhci_config_t *, int);
306static void		vhcache_dirty(mdi_vhci_config_t *);
307static void		free_async_client_config(mdi_async_client_config_t *);
308static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
309static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
310static nvlist_t		*read_on_disk_vhci_cache(char *);
311extern int		fread_nvlist(char *, nvlist_t **);
312extern int		fwrite_nvlist(char *, nvlist_t *);
313
/*
 * i_mdi_init():
 *		One-time initialization of MDI framework global state; called
 *		when the first vHCI registers with the framework (from
 *		mdi_vhci_register()).  Sets up the global mdi_mutex, the
 *		shared taskq, and the three hashes that map between pathinfo
 *		"path" strings and 'path_instance' numbers.
 *
 * NOTE(review): the 'initialized' guard is a plain static int with no
 * locking; presumably concurrent first-time registration is prevented by
 * the caller's attach context -- confirm.
 */
static void
i_mdi_init()
{
	static int initialized = 0;

	/* Idempotent: only the first caller performs initialization. */
	if (initialized)
		return;
	initialized = 1;

	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);

	/* Create our taskq resources */
	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */

	/* Allocate ['path_instance' <-> "path"] maps */
	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
	mdi_pathmap_bypath = mod_hash_create_strhash(
	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
	    mod_hash_null_valdtor);
	mdi_pathmap_byinstance = mod_hash_create_idhash(
	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
	    mod_hash_null_valdtor);
	/* third map: instance -> short-form "path" */
	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
	    mod_hash_null_valdtor);
}
344
345/*
346 * mdi_get_component_type():
347 *		Return mpxio component type
348 * Return Values:
349 *		MDI_COMPONENT_NONE
350 *		MDI_COMPONENT_VHCI
351 *		MDI_COMPONENT_PHCI
352 *		MDI_COMPONENT_CLIENT
353 * XXX This doesn't work under multi-level MPxIO and should be
354 *	removed when clients migrate mdi_component_is_*() interfaces.
355 */
356int
357mdi_get_component_type(dev_info_t *dip)
358{
359	return (DEVI(dip)->devi_mdi_component);
360}
361
362/*
363 * mdi_vhci_register():
364 *		Register a vHCI module with the mpxio framework
365 *		mdi_vhci_register() is called by vHCI drivers to register the
366 *		'class_driver' vHCI driver and its MDI entrypoints with the
367 *		mpxio framework.  The vHCI driver must call this interface as
368 *		part of its attach(9e) handler.
369 *		Competing threads may try to attach mdi_vhci_register() as
370 *		the vHCI drivers are loaded and attached as a result of pHCI
371 *		driver instance registration (mdi_phci_register()) with the
372 *		framework.
373 * Return Values:
374 *		MDI_SUCCESS
375 *		MDI_FAILURE
376 */
/*ARGSUSED*/
int
mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
    int flags)
{
	mdi_vhci_t		*vh = NULL;

	/* Registrant can't be older */
	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);

#ifdef DEBUG
	/*
	 * IB nexus driver is loaded only when IB hardware is present.
	 * In order to be able to do this there is a need to drive the loading
	 * and attaching of the IB nexus driver (especially when an IB hardware
	 * is dynamically plugged in) when an IB HCA driver (PHCI)
	 * is being attached. Unfortunately this gets into the limitations
	 * of devfs as there seems to be no clean way to drive configuration
	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
	 * for IB.
	 */
	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
#endif

	/* One-time global framework initialization (idempotent). */
	i_mdi_init();

	mutex_enter(&mdi_mutex);
	/*
	 * Scan for already registered vhci
	 */
	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
		if (strcmp(vh->vh_class, class) == 0) {
			/*
			 * vHCI has already been created.  Check for valid
			 * vHCI ops registration.  We only support one vHCI
			 * module per class
			 */
			if (vh->vh_ops != NULL) {
				mutex_exit(&mdi_mutex);
				cmn_err(CE_NOTE, vhci_greeting, class);
				return (MDI_FAILURE);
			}
			/*
			 * NOTE(review): a matching vh with vh_ops == NULL
			 * skips the creation block below, so 'vops' is not
			 * stored in that case.  mdi_vhci_unregister() frees
			 * the structure outright, so this path appears
			 * unreachable -- confirm.
			 */
			break;
		}
	}

	/*
	 * if not yet created, create the vHCI component
	 */
	if (vh == NULL) {
		struct client_hash	*hash = NULL;
		char			*load_balance;

		/*
		 * Allocate and initialize the mdi extensions
		 */
		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
		    KM_SLEEP);
		vh->vh_client_table = hash;
		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
		(void) strcpy(vh->vh_class, class);
		/*
		 * Default load-balancing policy is round-robin; it may be
		 * overridden by the vHCI's "load-balance" property below.
		 */
		vh->vh_lb = LOAD_BALANCE_RR;
		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
				vh->vh_lb = LOAD_BALANCE_NONE;
			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
				    == 0) {
				vh->vh_lb = LOAD_BALANCE_LBA;
			}
			ddi_prop_free(load_balance);
		}

		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);

		/*
		 * Store the vHCI ops vectors
		 */
		vh->vh_dip = vdip;
		vh->vh_ops = vops;

		setup_vhci_cache(vh);

		/* Append the new vHCI to the global list (tail insertion). */
		if (mdi_vhci_head == NULL) {
			mdi_vhci_head = vh;
		}
		if (mdi_vhci_tail) {
			mdi_vhci_tail->vh_next = vh;
		}
		mdi_vhci_tail = vh;
		mdi_vhci_count++;
	}

	/*
	 * Claim the devfs node as a vhci component
	 */
	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;

	/*
	 * Initialize our back reference from dev_info node
	 */
	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
	mutex_exit(&mdi_mutex);
	return (MDI_SUCCESS);
}
485
486/*
487 * mdi_vhci_unregister():
488 *		Unregister a vHCI module from mpxio framework
489 *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
490 * 		of a vhci to unregister it from the framework.
491 * Return Values:
492 *		MDI_SUCCESS
493 *		MDI_FAILURE
494 */
/*ARGSUSED*/
int
mdi_vhci_unregister(dev_info_t *vdip, int flags)
{
	mdi_vhci_t	*found, *vh, *prev = NULL;

	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));

	/*
	 * Check for invalid VHCI
	 */
	if ((vh = i_devi_get_vhci(vdip)) == NULL)
		return (MDI_FAILURE);

	/*
	 * Scan the list of registered vHCIs for a match.  'prev' trails
	 * 'found' so the node can be unlinked below.
	 */
	mutex_enter(&mdi_mutex);
	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
		if (found == vh)
			break;
		prev = found;
	}

	if (found == NULL) {
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}

	/*
	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
	 * should have been unregistered, before a vHCI can be
	 * unregistered.
	 */
	MDI_VHCI_PHCI_LOCK(vh);
	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
		MDI_VHCI_PHCI_UNLOCK(vh);
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}
	MDI_VHCI_PHCI_UNLOCK(vh);

	/* Tear down the vhci cache; fail the unregister if that fails. */
	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}

	/*
	 * Remove the vHCI from the global list
	 */
	if (vh == mdi_vhci_head) {
		mdi_vhci_head = vh->vh_next;
	} else {
		prev->vh_next = vh->vh_next;
	}
	if (vh == mdi_vhci_tail) {
		mdi_vhci_tail = prev;
	}
	mdi_vhci_count--;
	mutex_exit(&mdi_mutex);

	/*
	 * 'vh' is now unreachable from the global list: clear the devinfo
	 * node's vHCI markings and free all structure resources.
	 */
	vh->vh_ops = NULL;
	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
	DEVI(vdip)->devi_mdi_xhci = NULL;
	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
	kmem_free(vh->vh_client_table,
	    mdi_client_table_size * sizeof (struct client_hash));
	mutex_destroy(&vh->vh_phci_mutex);
	mutex_destroy(&vh->vh_client_mutex);

	kmem_free(vh, sizeof (mdi_vhci_t));
	return (MDI_SUCCESS);
}
568
569/*
570 * i_mdi_vhci_class2vhci():
571 *		Look for a matching vHCI module given a vHCI class name
572 * Return Values:
573 *		Handle to a vHCI component
574 *		NULL
575 */
576static mdi_vhci_t *
577i_mdi_vhci_class2vhci(char *class)
578{
579	mdi_vhci_t	*vh = NULL;
580
581	ASSERT(!MUTEX_HELD(&mdi_mutex));
582
583	mutex_enter(&mdi_mutex);
584	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
585		if (strcmp(vh->vh_class, class) == 0) {
586			break;
587		}
588	}
589	mutex_exit(&mdi_mutex);
590	return (vh);
591}
592
593/*
594 * i_devi_get_vhci():
595 *		Utility function to get the handle to a vHCI component
596 * Return Values:
597 *		Handle to a vHCI component
598 *		NULL
599 */
600mdi_vhci_t *
601i_devi_get_vhci(dev_info_t *vdip)
602{
603	mdi_vhci_t	*vh = NULL;
604	if (MDI_VHCI(vdip)) {
605		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
606	}
607	return (vh);
608}
609
610/*
611 * mdi_phci_register():
612 *		Register a pHCI module with mpxio framework
613 *		mdi_phci_register() is called by pHCI drivers to register with
614 *		the mpxio framework and a specific 'class_driver' vHCI.  The
615 *		pHCI driver must call this interface as part of its attach(9e)
616 *		handler.
617 * Return Values:
618 *		MDI_SUCCESS
619 *		MDI_FAILURE
620 */
/*ARGSUSED*/
int
mdi_phci_register(char *class, dev_info_t *pdip, int flags)
{
	mdi_phci_t		*ph;
	mdi_vhci_t		*vh;
	char			*data;

	/*
	 * Some subsystems, like fcp, perform pHCI registration from a
	 * different thread than the one doing the pHCI attach(9E) - the
	 * driver attach code is waiting for this other thread to complete.
	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
	 * (indicating that some thread has done an ndi_devi_enter of parent)
	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
	 */
	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));

	/*
	 * Check for mpxio-disable property. Enable mpxio if the property is
	 * missing or not set to "yes".
	 * If the property is set to "yes" then emit a brief message.
	 */
	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
	    &data) == DDI_SUCCESS)) {
		if (strcmp(data, "yes") == 0) {
			MDI_DEBUG(1, (MDI_CONT, pdip,
			    "?multipath capabilities disabled via %s.conf.",
			    ddi_driver_name(pdip)));
			ddi_prop_free(data);
			return (MDI_FAILURE);
		}
		ddi_prop_free(data);
	}

	/*
	 * Search for a matching vHCI
	 */
	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
	if (vh == NULL) {
		/* No vHCI registered for this class; cannot register. */
		return (MDI_FAILURE);
	}

	/* Allocate and initialize the per-pHCI state. */
	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
	ph->ph_dip = pdip;
	ph->ph_vhci = vh;
	ph->ph_next = NULL;
	ph->ph_unstable = 0;
	ph->ph_vprivate = 0;
	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);

	/* Mark the pHCI powered-up before exposing it to the framework. */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_SET_POWER_UP(ph);
	MDI_PHCI_UNLOCK(ph);
	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;

	/* Record this pHCI in the vHCI's cache configuration. */
	vhcache_phci_add(vh->vh_config, ph);

	/* Link the new pHCI onto the tail of the vHCI's pHCI list. */
	MDI_VHCI_PHCI_LOCK(vh);
	if (vh->vh_phci_head == NULL) {
		vh->vh_phci_head = ph;
	}
	if (vh->vh_phci_tail) {
		vh->vh_phci_tail->ph_next = ph;
	}
	vh->vh_phci_tail = ph;
	vh->vh_phci_count++;
	MDI_VHCI_PHCI_UNLOCK(vh);

	/* Announce initiator registration to interested listeners. */
	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
	return (MDI_SUCCESS);
}
695
696/*
697 * mdi_phci_unregister():
698 *		Unregister a pHCI module from mpxio framework
699 *		mdi_phci_unregister() is called by the pHCI drivers from their
700 *		detach(9E) handler to unregister their instances from the
701 *		framework.
702 * Return Values:
703 *		MDI_SUCCESS
704 *		MDI_FAILURE
705 */
/*ARGSUSED*/
int
mdi_phci_unregister(dev_info_t *pdip, int flags)
{
	mdi_vhci_t		*vh;
	mdi_phci_t		*ph;
	mdi_phci_t		*tmp;
	mdi_phci_t		*prev = NULL;
	mdi_pathinfo_t		*pip;

	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));

	ph = i_devi_get_phci(pdip);
	if (ph == NULL) {
		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh != NULL);
	/* Defensive re-check of the ASSERT for non-DEBUG kernels. */
	if (vh == NULL) {
		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
		return (MDI_FAILURE);
	}

	/*
	 * Unlink 'ph' from the vHCI's pHCI list.  'prev' trails 'tmp'
	 * during the walk so the list can be repaired around the node.
	 */
	MDI_VHCI_PHCI_LOCK(vh);
	tmp = vh->vh_phci_head;
	while (tmp) {
		if (tmp == ph) {
			break;
		}
		prev = tmp;
		tmp = tmp->ph_next;
	}

	if (ph == vh->vh_phci_head) {
		vh->vh_phci_head = ph->ph_next;
	} else {
		prev->ph_next = ph->ph_next;
	}

	if (ph == vh->vh_phci_tail) {
		vh->vh_phci_tail = prev;
	}

	vh->vh_phci_count--;
	MDI_VHCI_PHCI_UNLOCK(vh);

	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
	MDI_PHCI_LOCK(ph);
	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
		MDI_PI(pip)->pi_phci = NULL;
	MDI_PHCI_UNLOCK(ph);

	/*
	 * Announce the unregistration, remove the pHCI from the vhci
	 * cache, and free all per-pHCI resources.
	 */
	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
	    ESC_DDI_INITIATOR_UNREGISTER);
	vhcache_phci_remove(vh->vh_config, ph);
	cv_destroy(&ph->ph_unstable_cv);
	mutex_destroy(&ph->ph_mutex);
	kmem_free(ph, sizeof (mdi_phci_t));
	/* Clear the devinfo node's pHCI markings and back reference. */
	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
	DEVI(pdip)->devi_mdi_xhci = NULL;
	return (MDI_SUCCESS);
}
771
772/*
773 * i_devi_get_phci():
774 * 		Utility function to return the phci extensions.
775 */
776static mdi_phci_t *
777i_devi_get_phci(dev_info_t *pdip)
778{
779	mdi_phci_t	*ph = NULL;
780
781	if (MDI_PHCI(pdip)) {
782		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
783	}
784	return (ph);
785}
786
787/*
788 * Single thread mdi entry into devinfo node for modifying its children.
789 * If necessary we perform an ndi_devi_enter of the vHCI before doing
790 * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
791 * for the vHCI and one for the pHCI.
792 */
void
mdi_devi_enter(dev_info_t *phci_dip, int *circular)
{
	dev_info_t	*vdip;
	int		vcircular, pcircular;

	/* Verify calling context */
	ASSERT(MDI_PHCI(phci_dip));
	vdip = mdi_devi_get_vdip(phci_dip);
	ASSERT(vdip);			/* A pHCI always has a vHCI */

	/*
	 * If pHCI is detaching then the framework has already entered the
	 * vHCI on a threads that went down the code path leading to
	 * detach_node().  This framework enter of the vHCI during pHCI
	 * detach is done to avoid deadlock with vHCI power management
	 * operations which enter the vHCI and the enter down the path
	 * to the pHCI. If pHCI is detaching then we piggyback this calls
	 * enter of the vHCI on frameworks vHCI enter that has already
	 * occurred - this is OK because we know that the framework thread
	 * doing detach is waiting for our completion.
	 *
	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
	 * race with detach - but we can't do that because the framework has
	 * already entered the parent, so we have some complexity instead.
	 */
	for (;;) {
		if (ndi_devi_tryenter(vdip, &vcircular)) {
			ASSERT(vcircular != -1);
			if (DEVI_IS_DETACHING(phci_dip)) {
				/*
				 * Raced with detach: undo our vHCI enter
				 * and piggyback on the framework's enter.
				 * vcircular == -1 tells mdi_devi_exit()
				 * not to exit the vHCI.
				 */
				ndi_devi_exit(vdip, vcircular);
				vcircular = -1;
			}
			break;
		} else if (DEVI_IS_DETACHING(phci_dip)) {
			/* Piggyback on the framework's vHCI enter. */
			vcircular = -1;
			break;
		} else if (servicing_interrupt()) {
			/*
			 * Don't delay an interrupt (and ensure adaptive
			 * mutex inversion support).
			 */
			ndi_devi_enter(vdip, &vcircular);
			break;
		} else {
			/* Busy: back off briefly before retrying. */
			delay_random(mdi_delay);
		}
	}

	ndi_devi_enter(phci_dip, &pcircular);
	/*
	 * Pack both circular recursion values into the single int the
	 * caller later hands to mdi_devi_exit(): vHCI value in the high
	 * 16 bits, pHCI value in the low 16 bits.
	 */
	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
}
845
846/*
847 * Attempt to mdi_devi_enter.
848 */
849int
850mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
851{
852	dev_info_t	*vdip;
853	int		vcircular, pcircular;
854
855	/* Verify calling context */
856	ASSERT(MDI_PHCI(phci_dip));
857	vdip = mdi_devi_get_vdip(phci_dip);
858	ASSERT(vdip);			/* A pHCI always has a vHCI */
859
860	if (ndi_devi_tryenter(vdip, &vcircular)) {
861		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
862			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
863			return (1);	/* locked */
864		}
865		ndi_devi_exit(vdip, vcircular);
866	}
867	return (0);			/* busy */
868}
869
870/*
871 * Release mdi_devi_enter or successful mdi_devi_tryenter.
872 */
873void
874mdi_devi_exit(dev_info_t *phci_dip, int circular)
875{
876	dev_info_t	*vdip;
877	int		vcircular, pcircular;
878
879	/* Verify calling context */
880	ASSERT(MDI_PHCI(phci_dip));
881	vdip = mdi_devi_get_vdip(phci_dip);
882	ASSERT(vdip);			/* A pHCI always has a vHCI */
883
884	/* extract two circular recursion values from single int */
885	pcircular = (short)(circular & 0xFFFF);
886	vcircular = (short)((circular >> 16) & 0xFFFF);
887
888	ndi_devi_exit(phci_dip, pcircular);
889	if (vcircular != -1)
890		ndi_devi_exit(vdip, vcircular);
891}
892
893/*
894 * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
895 * around a pHCI drivers calls to mdi_pi_online/offline, after holding
896 * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
897 * with vHCI power management code during path online/offline.  Each
898 * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
899 * occur within the scope of an active mdi_devi_enter that establishes the
900 * circular value.
901 */
902void
903mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
904{
905	int		pcircular;
906
907	/* Verify calling context */
908	ASSERT(MDI_PHCI(phci_dip));
909
910	/* Keep hold on pHCI until we reenter in mdi_devi_enter_phci */
911	ndi_hold_devi(phci_dip);
912
913	pcircular = (short)(circular & 0xFFFF);
914	ndi_devi_exit(phci_dip, pcircular);
915}
916
917void
918mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
919{
920	int		pcircular;
921
922	/* Verify calling context */
923	ASSERT(MDI_PHCI(phci_dip));
924
925	ndi_devi_enter(phci_dip, &pcircular);
926
927	/* Drop hold from mdi_devi_exit_phci. */
928	ndi_rele_devi(phci_dip);
929
930	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
931	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
932}
933
934/*
935 * mdi_devi_get_vdip():
936 *		given a pHCI dip return vHCI dip
937 */
938dev_info_t *
939mdi_devi_get_vdip(dev_info_t *pdip)
940{
941	mdi_phci_t	*ph;
942
943	ph = i_devi_get_phci(pdip);
944	if (ph && ph->ph_vhci)
945		return (ph->ph_vhci->vh_dip);
946	return (NULL);
947}
948
949/*
950 * mdi_devi_pdip_entered():
951 *		Return 1 if we are vHCI and have done an ndi_devi_enter
952 *		of a pHCI
953 */
954int
955mdi_devi_pdip_entered(dev_info_t *vdip)
956{
957	mdi_vhci_t	*vh;
958	mdi_phci_t	*ph;
959
960	vh = i_devi_get_vhci(vdip);
961	if (vh == NULL)
962		return (0);
963
964	MDI_VHCI_PHCI_LOCK(vh);
965	ph = vh->vh_phci_head;
966	while (ph) {
967		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
968			MDI_VHCI_PHCI_UNLOCK(vh);
969			return (1);
970		}
971		ph = ph->ph_next;
972	}
973	MDI_VHCI_PHCI_UNLOCK(vh);
974	return (0);
975}
976
977/*
978 * mdi_phci_path2devinfo():
979 * 		Utility function to search for a valid phci device given
980 *		the devfs pathname.
981 */
982dev_info_t *
983mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
984{
985	char		*temp_pathname;
986	mdi_vhci_t	*vh;
987	mdi_phci_t	*ph;
988	dev_info_t 	*pdip = NULL;
989
990	vh = i_devi_get_vhci(vdip);
991	ASSERT(vh != NULL);
992
993	if (vh == NULL) {
994		/*
995		 * Invalid vHCI component, return failure
996		 */
997		return (NULL);
998	}
999
1000	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1001	MDI_VHCI_PHCI_LOCK(vh);
1002	ph = vh->vh_phci_head;
1003	while (ph != NULL) {
1004		pdip = ph->ph_dip;
1005		ASSERT(pdip != NULL);
1006		*temp_pathname = '\0';
1007		(void) ddi_pathname(pdip, temp_pathname);
1008		if (strcmp(temp_pathname, pathname) == 0) {
1009			break;
1010		}
1011		ph = ph->ph_next;
1012	}
1013	if (ph == NULL) {
1014		pdip = NULL;
1015	}
1016	MDI_VHCI_PHCI_UNLOCK(vh);
1017	kmem_free(temp_pathname, MAXPATHLEN);
1018	return (pdip);
1019}
1020
1021/*
1022 * mdi_phci_get_path_count():
1023 * 		get number of path information nodes associated with a given
1024 *		pHCI device.
1025 */
1026int
1027mdi_phci_get_path_count(dev_info_t *pdip)
1028{
1029	mdi_phci_t	*ph;
1030	int		count = 0;
1031
1032	ph = i_devi_get_phci(pdip);
1033	if (ph != NULL) {
1034		count = ph->ph_path_count;
1035	}
1036	return (count);
1037}
1038
1039/*
1040 * i_mdi_phci_lock():
1041 *		Lock a pHCI device
1042 * Return Values:
1043 *		None
1044 * Note:
1045 *		The default locking order is:
1046 *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
1047 *		But there are number of situations where locks need to be
1048 *		grabbed in reverse order.  This routine implements try and lock
1049 *		mechanism depending on the requested parameter option.
1050 */
1051static void
1052i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
1053{
1054	if (pip) {
1055		/* Reverse locking is requested. */
1056		while (MDI_PHCI_TRYLOCK(ph) == 0) {
1057			if (servicing_interrupt()) {
1058				MDI_PI_HOLD(pip);
1059				MDI_PI_UNLOCK(pip);
1060				MDI_PHCI_LOCK(ph);
1061				MDI_PI_LOCK(pip);
1062				MDI_PI_RELE(pip);
1063				break;
1064			} else {
1065				/*
1066				 * tryenter failed. Try to grab again
1067				 * after a small delay
1068				 */
1069				MDI_PI_HOLD(pip);
1070				MDI_PI_UNLOCK(pip);
1071				delay_random(mdi_delay);
1072				MDI_PI_LOCK(pip);
1073				MDI_PI_RELE(pip);
1074			}
1075		}
1076	} else {
1077		MDI_PHCI_LOCK(ph);
1078	}
1079}
1080
1081/*
1082 * i_mdi_phci_unlock():
1083 *		Unlock the pHCI component
1084 */
1085static void
1086i_mdi_phci_unlock(mdi_phci_t *ph)
1087{
1088	MDI_PHCI_UNLOCK(ph);
1089}
1090
1091/*
1092 * i_mdi_devinfo_create():
1093 *		create client device's devinfo node
1094 * Return Values:
1095 *		dev_info
1096 *		NULL
1097 * Notes:
1098 */
1099static dev_info_t *
1100i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1101	char **compatible, int ncompatible)
1102{
1103	dev_info_t *cdip = NULL;
1104
1105	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1106
1107	/* Verify for duplicate entry */
1108	cdip = i_mdi_devinfo_find(vh, name, guid);
1109	ASSERT(cdip == NULL);
1110	if (cdip) {
1111		cmn_err(CE_WARN,
1112		    "i_mdi_devinfo_create: client %s@%s already exists",
1113			name ? name : "", guid ? guid : "");
1114	}
1115
1116	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1117	if (cdip == NULL)
1118		goto fail;
1119
1120	/*
1121	 * Create component type and Global unique identifier
1122	 * properties
1123	 */
1124	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1125	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1126		goto fail;
1127	}
1128
1129	/* Decorate the node with compatible property */
1130	if (compatible &&
1131	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1132	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1133		goto fail;
1134	}
1135
1136	return (cdip);
1137
1138fail:
1139	if (cdip) {
1140		(void) ndi_prop_remove_all(cdip);
1141		(void) ndi_devi_free(cdip);
1142	}
1143	return (NULL);
1144}
1145
1146/*
1147 * i_mdi_devinfo_find():
1148 *		Find a matching devinfo node for given client node name
1149 *		and its guid.
1150 * Return Values:
1151 *		Handle to a dev_info node or NULL
1152 */
1153static dev_info_t *
1154i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1155{
1156	char			*data;
1157	dev_info_t 		*cdip = NULL;
1158	dev_info_t 		*ndip = NULL;
1159	int			circular;
1160
1161	ndi_devi_enter(vh->vh_dip, &circular);
1162	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1163	while ((cdip = ndip) != NULL) {
1164		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1165
1166		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1167			continue;
1168		}
1169
1170		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1171		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1172		    &data) != DDI_PROP_SUCCESS) {
1173			continue;
1174		}
1175
1176		if (strcmp(data, guid) != 0) {
1177			ddi_prop_free(data);
1178			continue;
1179		}
1180		ddi_prop_free(data);
1181		break;
1182	}
1183	ndi_devi_exit(vh->vh_dip, circular);
1184	return (cdip);
1185}
1186
1187/*
1188 * i_mdi_devinfo_remove():
1189 *		Remove a client device node
1190 */
1191static int
1192i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1193{
1194	int	rv = MDI_SUCCESS;
1195
1196	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1197	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1198		rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
1199		if (rv != NDI_SUCCESS) {
1200			MDI_DEBUG(1, (MDI_NOTE, cdip,
1201			    "!failed: cdip %p", (void *)cdip));
1202		}
1203		/*
1204		 * Convert to MDI error code
1205		 */
1206		switch (rv) {
1207		case NDI_SUCCESS:
1208			rv = MDI_SUCCESS;
1209			break;
1210		case NDI_BUSY:
1211			rv = MDI_BUSY;
1212			break;
1213		default:
1214			rv = MDI_FAILURE;
1215			break;
1216		}
1217	}
1218	return (rv);
1219}
1220
1221/*
1222 * i_devi_get_client()
1223 *		Utility function to get mpxio component extensions
1224 */
1225static mdi_client_t *
1226i_devi_get_client(dev_info_t *cdip)
1227{
1228	mdi_client_t	*ct = NULL;
1229
1230	if (MDI_CLIENT(cdip)) {
1231		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1232	}
1233	return (ct);
1234}
1235
1236/*
1237 * i_mdi_is_child_present():
1238 *		Search for the presence of client device dev_info node
1239 */
1240static int
1241i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1242{
1243	int		rv = MDI_FAILURE;
1244	struct dev_info	*dip;
1245	int		circular;
1246
1247	ndi_devi_enter(vdip, &circular);
1248	dip = DEVI(vdip)->devi_child;
1249	while (dip) {
1250		if (dip == DEVI(cdip)) {
1251			rv = MDI_SUCCESS;
1252			break;
1253		}
1254		dip = dip->devi_sibling;
1255	}
1256	ndi_devi_exit(vdip, circular);
1257	return (rv);
1258}
1259
1260
1261/*
1262 * i_mdi_client_lock():
1263 *		Grab client component lock
1264 * Return Values:
1265 *		None
1266 * Note:
1267 *		The default locking order is:
1268 *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1269 *		But there are number of situations where locks need to be
1270 *		grabbed in reverse order.  This routine implements try and lock
1271 *		mechanism depending on the requested parameter option.
1272 */
1273static void
1274i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1275{
1276	if (pip) {
1277		/*
1278		 * Reverse locking is requested.
1279		 */
1280		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1281			if (servicing_interrupt()) {
1282				MDI_PI_HOLD(pip);
1283				MDI_PI_UNLOCK(pip);
1284				MDI_CLIENT_LOCK(ct);
1285				MDI_PI_LOCK(pip);
1286				MDI_PI_RELE(pip);
1287				break;
1288			} else {
1289				/*
1290				 * tryenter failed. Try to grab again
1291				 * after a small delay
1292				 */
1293				MDI_PI_HOLD(pip);
1294				MDI_PI_UNLOCK(pip);
1295				delay_random(mdi_delay);
1296				MDI_PI_LOCK(pip);
1297				MDI_PI_RELE(pip);
1298			}
1299		}
1300	} else {
1301		MDI_CLIENT_LOCK(ct);
1302	}
1303}
1304
1305/*
1306 * i_mdi_client_unlock():
1307 *		Unlock a client component
1308 */
1309static void
1310i_mdi_client_unlock(mdi_client_t *ct)
1311{
1312	MDI_CLIENT_UNLOCK(ct);
1313}
1314
1315/*
1316 * i_mdi_client_alloc():
1317 * 		Allocate and initialize a client structure.  Caller should
1318 *		hold the vhci client lock.
1319 * Return Values:
1320 *		Handle to a client component
1321 */
1322/*ARGSUSED*/
1323static mdi_client_t *
1324i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1325{
1326	mdi_client_t	*ct;
1327
1328	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1329
1330	/*
1331	 * Allocate and initialize a component structure.
1332	 */
1333	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1334	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1335	ct->ct_hnext = NULL;
1336	ct->ct_hprev = NULL;
1337	ct->ct_dip = NULL;
1338	ct->ct_vhci = vh;
1339	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1340	(void) strcpy(ct->ct_drvname, name);
1341	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1342	(void) strcpy(ct->ct_guid, lguid);
1343	ct->ct_cprivate = NULL;
1344	ct->ct_vprivate = NULL;
1345	ct->ct_flags = 0;
1346	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1347	MDI_CLIENT_LOCK(ct);
1348	MDI_CLIENT_SET_OFFLINE(ct);
1349	MDI_CLIENT_SET_DETACH(ct);
1350	MDI_CLIENT_SET_POWER_UP(ct);
1351	MDI_CLIENT_UNLOCK(ct);
1352	ct->ct_failover_flags = 0;
1353	ct->ct_failover_status = 0;
1354	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1355	ct->ct_unstable = 0;
1356	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1357	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1358	ct->ct_lb = vh->vh_lb;
1359	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1360	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1361	ct->ct_path_count = 0;
1362	ct->ct_path_head = NULL;
1363	ct->ct_path_tail = NULL;
1364	ct->ct_path_last = NULL;
1365
1366	/*
1367	 * Add this client component to our client hash queue
1368	 */
1369	i_mdi_client_enlist_table(vh, ct);
1370	return (ct);
1371}
1372
1373/*
1374 * i_mdi_client_enlist_table():
1375 *		Attach the client device to the client hash table. Caller
1376 *		should hold the vhci client lock.
1377 */
1378static void
1379i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1380{
1381	int 			index;
1382	struct client_hash	*head;
1383
1384	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1385
1386	index = i_mdi_get_hash_key(ct->ct_guid);
1387	head = &vh->vh_client_table[index];
1388	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1389	head->ct_hash_head = ct;
1390	head->ct_hash_count++;
1391	vh->vh_client_count++;
1392}
1393
1394/*
1395 * i_mdi_client_delist_table():
1396 *		Attach the client device to the client hash table.
1397 *		Caller should hold the vhci client lock.
1398 */
1399static void
1400i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1401{
1402	int			index;
1403	char			*guid;
1404	struct client_hash 	*head;
1405	mdi_client_t		*next;
1406	mdi_client_t		*last;
1407
1408	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1409
1410	guid = ct->ct_guid;
1411	index = i_mdi_get_hash_key(guid);
1412	head = &vh->vh_client_table[index];
1413
1414	last = NULL;
1415	next = (mdi_client_t *)head->ct_hash_head;
1416	while (next != NULL) {
1417		if (next == ct) {
1418			break;
1419		}
1420		last = next;
1421		next = next->ct_hnext;
1422	}
1423
1424	if (next) {
1425		head->ct_hash_count--;
1426		if (last == NULL) {
1427			head->ct_hash_head = ct->ct_hnext;
1428		} else {
1429			last->ct_hnext = ct->ct_hnext;
1430		}
1431		ct->ct_hnext = NULL;
1432		vh->vh_client_count--;
1433	}
1434}
1435
1436
1437/*
1438 * i_mdi_client_free():
1439 *		Free a client component
1440 */
1441static int
1442i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1443{
1444	int		rv = MDI_SUCCESS;
1445	int		flags = ct->ct_flags;
1446	dev_info_t	*cdip;
1447	dev_info_t	*vdip;
1448
1449	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1450
1451	vdip = vh->vh_dip;
1452	cdip = ct->ct_dip;
1453
1454	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1455	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1456	DEVI(cdip)->devi_mdi_client = NULL;
1457
1458	/*
1459	 * Clear out back ref. to dev_info_t node
1460	 */
1461	ct->ct_dip = NULL;
1462
1463	/*
1464	 * Remove this client from our hash queue
1465	 */
1466	i_mdi_client_delist_table(vh, ct);
1467
1468	/*
1469	 * Uninitialize and free the component
1470	 */
1471	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1472	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1473	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1474	cv_destroy(&ct->ct_failover_cv);
1475	cv_destroy(&ct->ct_unstable_cv);
1476	cv_destroy(&ct->ct_powerchange_cv);
1477	mutex_destroy(&ct->ct_mutex);
1478	kmem_free(ct, sizeof (*ct));
1479
1480	if (cdip != NULL) {
1481		MDI_VHCI_CLIENT_UNLOCK(vh);
1482		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1483		MDI_VHCI_CLIENT_LOCK(vh);
1484	}
1485	return (rv);
1486}
1487
1488/*
1489 * i_mdi_client_find():
1490 * 		Find the client structure corresponding to a given guid
1491 *		Caller should hold the vhci client lock.
1492 */
1493static mdi_client_t *
1494i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1495{
1496	int			index;
1497	struct client_hash	*head;
1498	mdi_client_t		*ct;
1499
1500	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1501
1502	index = i_mdi_get_hash_key(guid);
1503	head = &vh->vh_client_table[index];
1504
1505	ct = head->ct_hash_head;
1506	while (ct != NULL) {
1507		if (strcmp(ct->ct_guid, guid) == 0 &&
1508		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1509			break;
1510		}
1511		ct = ct->ct_hnext;
1512	}
1513	return (ct);
1514}
1515
1516/*
1517 * i_mdi_client_update_state():
1518 *		Compute and update client device state
1519 * Notes:
1520 *		A client device can be in any of three possible states:
1521 *
1522 *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1523 *		one online/standby paths. Can tolerate failures.
1524 *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1525 *		no alternate paths available as standby. A failure on the online
1526 *		would result in loss of access to device data.
1527 *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1528 *		no paths available to access the device.
1529 */
1530static void
1531i_mdi_client_update_state(mdi_client_t *ct)
1532{
1533	int state;
1534
1535	ASSERT(MDI_CLIENT_LOCKED(ct));
1536	state = i_mdi_client_compute_state(ct, NULL);
1537	MDI_CLIENT_SET_STATE(ct, state);
1538}
1539
1540/*
1541 * i_mdi_client_compute_state():
1542 *		Compute client device state
1543 *
1544 *		mdi_phci_t *	Pointer to pHCI structure which should
1545 *				while computing the new value.  Used by
1546 *				i_mdi_phci_offline() to find the new
1547 *				client state after DR of a pHCI.
1548 */
1549static int
1550i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1551{
1552	int		state;
1553	int		online_count = 0;
1554	int		standby_count = 0;
1555	mdi_pathinfo_t	*pip, *next;
1556
1557	ASSERT(MDI_CLIENT_LOCKED(ct));
1558	pip = ct->ct_path_head;
1559	while (pip != NULL) {
1560		MDI_PI_LOCK(pip);
1561		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1562		if (MDI_PI(pip)->pi_phci == ph) {
1563			MDI_PI_UNLOCK(pip);
1564			pip = next;
1565			continue;
1566		}
1567
1568		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1569				== MDI_PATHINFO_STATE_ONLINE)
1570			online_count++;
1571		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1572				== MDI_PATHINFO_STATE_STANDBY)
1573			standby_count++;
1574		MDI_PI_UNLOCK(pip);
1575		pip = next;
1576	}
1577
1578	if (online_count == 0) {
1579		if (standby_count == 0) {
1580			state = MDI_CLIENT_STATE_FAILED;
1581			MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
1582			    "client state failed: ct = %p", (void *)ct));
1583		} else if (standby_count == 1) {
1584			state = MDI_CLIENT_STATE_DEGRADED;
1585		} else {
1586			state = MDI_CLIENT_STATE_OPTIMAL;
1587		}
1588	} else if (online_count == 1) {
1589		if (standby_count == 0) {
1590			state = MDI_CLIENT_STATE_DEGRADED;
1591		} else {
1592			state = MDI_CLIENT_STATE_OPTIMAL;
1593		}
1594	} else {
1595		state = MDI_CLIENT_STATE_OPTIMAL;
1596	}
1597	return (state);
1598}
1599
1600/*
1601 * i_mdi_client2devinfo():
1602 *		Utility function
1603 */
1604dev_info_t *
1605i_mdi_client2devinfo(mdi_client_t *ct)
1606{
1607	return (ct->ct_dip);
1608}
1609
1610/*
1611 * mdi_client_path2_devinfo():
1612 * 		Given the parent devinfo and child devfs pathname, search for
1613 *		a valid devfs node handle.
1614 */
1615dev_info_t *
1616mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1617{
1618	dev_info_t 	*cdip = NULL;
1619	dev_info_t 	*ndip = NULL;
1620	char		*temp_pathname;
1621	int		circular;
1622
1623	/*
1624	 * Allocate temp buffer
1625	 */
1626	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1627
1628	/*
1629	 * Lock parent against changes
1630	 */
1631	ndi_devi_enter(vdip, &circular);
1632	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1633	while ((cdip = ndip) != NULL) {
1634		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1635
1636		*temp_pathname = '\0';
1637		(void) ddi_pathname(cdip, temp_pathname);
1638		if (strcmp(temp_pathname, pathname) == 0) {
1639			break;
1640		}
1641	}
1642	/*
1643	 * Release devinfo lock
1644	 */
1645	ndi_devi_exit(vdip, circular);
1646
1647	/*
1648	 * Free the temp buffer
1649	 */
1650	kmem_free(temp_pathname, MAXPATHLEN);
1651	return (cdip);
1652}
1653
1654/*
1655 * mdi_client_get_path_count():
1656 * 		Utility function to get number of path information nodes
1657 *		associated with a given client device.
1658 */
1659int
1660mdi_client_get_path_count(dev_info_t *cdip)
1661{
1662	mdi_client_t	*ct;
1663	int		count = 0;
1664
1665	ct = i_devi_get_client(cdip);
1666	if (ct != NULL) {
1667		count = ct->ct_path_count;
1668	}
1669	return (count);
1670}
1671
1672
1673/*
1674 * i_mdi_get_hash_key():
1675 * 		Create a hash using strings as keys
1676 *
1677 */
1678static int
1679i_mdi_get_hash_key(char *str)
1680{
1681	uint32_t	g, hash = 0;
1682	char		*p;
1683
1684	for (p = str; *p != '\0'; p++) {
1685		g = *p;
1686		hash += g;
1687	}
1688	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1689}
1690
1691/*
1692 * mdi_get_lb_policy():
1693 * 		Get current load balancing policy for a given client device
1694 */
1695client_lb_t
1696mdi_get_lb_policy(dev_info_t *cdip)
1697{
1698	client_lb_t	lb = LOAD_BALANCE_NONE;
1699	mdi_client_t	*ct;
1700
1701	ct = i_devi_get_client(cdip);
1702	if (ct != NULL) {
1703		lb = ct->ct_lb;
1704	}
1705	return (lb);
1706}
1707
1708/*
1709 * mdi_set_lb_region_size():
1710 * 		Set current region size for the load-balance
1711 */
1712int
1713mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1714{
1715	mdi_client_t	*ct;
1716	int		rv = MDI_FAILURE;
1717
1718	ct = i_devi_get_client(cdip);
1719	if (ct != NULL && ct->ct_lb_args != NULL) {
1720		ct->ct_lb_args->region_size = region_size;
1721		rv = MDI_SUCCESS;
1722	}
1723	return (rv);
1724}
1725
1726/*
1727 * mdi_Set_lb_policy():
1728 * 		Set current load balancing policy for a given client device
1729 */
1730int
1731mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1732{
1733	mdi_client_t	*ct;
1734	int		rv = MDI_FAILURE;
1735
1736	ct = i_devi_get_client(cdip);
1737	if (ct != NULL) {
1738		ct->ct_lb = lb;
1739		rv = MDI_SUCCESS;
1740	}
1741	return (rv);
1742}
1743
1744/*
1745 * mdi_failover():
1746 *		failover function called by the vHCI drivers to initiate
1747 *		a failover operation.  This is typically due to non-availability
1748 *		of online paths to route I/O requests.  Failover can be
1749 *		triggered through user application also.
1750 *
1751 *		The vHCI driver calls mdi_failover() to initiate a failover
1752 *		operation. mdi_failover() calls back into the vHCI driver's
1753 *		vo_failover() entry point to perform the actual failover
1754 *		operation.  The reason for requiring the vHCI driver to
1755 *		initiate failover by calling mdi_failover(), instead of directly
1756 *		executing vo_failover() itself, is to ensure that the mdi
1757 *		framework can keep track of the client state properly.
1758 *		Additionally, mdi_failover() provides as a convenience the
1759 *		option of performing the failover operation synchronously or
1760 *		asynchronously
1761 *
1762 *		Upon successful completion of the failover operation, the
1763 *		paths that were previously ONLINE will be in the STANDBY state,
1764 *		and the newly activated paths will be in the ONLINE state.
1765 *
1766 *		The flags modifier determines whether the activation is done
1767 *		synchronously: MDI_FAILOVER_SYNC
1768 * Return Values:
1769 *		MDI_SUCCESS
1770 *		MDI_FAILURE
1771 *		MDI_BUSY
1772 */
1773/*ARGSUSED*/
1774int
1775mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1776{
1777	int			rv;
1778	mdi_client_t		*ct;
1779
1780	ct = i_devi_get_client(cdip);
1781	ASSERT(ct != NULL);
1782	if (ct == NULL) {
1783		/* cdip is not a valid client device. Nothing more to do. */
1784		return (MDI_FAILURE);
1785	}
1786
1787	MDI_CLIENT_LOCK(ct);
1788
1789	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1790		/* A path to the client is being freed */
1791		MDI_CLIENT_UNLOCK(ct);
1792		return (MDI_BUSY);
1793	}
1794
1795
1796	if (MDI_CLIENT_IS_FAILED(ct)) {
1797		/*
1798		 * Client is in failed state. Nothing more to do.
1799		 */
1800		MDI_CLIENT_UNLOCK(ct);
1801		return (MDI_FAILURE);
1802	}
1803
1804	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1805		/*
1806		 * Failover is already in progress; return BUSY
1807		 */
1808		MDI_CLIENT_UNLOCK(ct);
1809		return (MDI_BUSY);
1810	}
1811	/*
1812	 * Make sure that mdi_pathinfo node state changes are processed.
1813	 * We do not allow failovers to progress while client path state
1814	 * changes are in progress
1815	 */
1816	if (ct->ct_unstable) {
1817		if (flags == MDI_FAILOVER_ASYNC) {
1818			MDI_CLIENT_UNLOCK(ct);
1819			return (MDI_BUSY);
1820		} else {
1821			while (ct->ct_unstable)
1822				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1823		}
1824	}
1825
1826	/*
1827	 * Client device is in stable state. Before proceeding, perform sanity
1828	 * checks again.
1829	 */
1830	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1831	    (!i_ddi_devi_attached(cdip))) {
1832		/*
1833		 * Client is in failed state. Nothing more to do.
1834		 */
1835		MDI_CLIENT_UNLOCK(ct);
1836		return (MDI_FAILURE);
1837	}
1838
1839	/*
1840	 * Set the client state as failover in progress.
1841	 */
1842	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1843	ct->ct_failover_flags = flags;
1844	MDI_CLIENT_UNLOCK(ct);
1845
1846	if (flags == MDI_FAILOVER_ASYNC) {
1847		/*
1848		 * Submit the initiate failover request via CPR safe
1849		 * taskq threads.
1850		 */
1851		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1852		    ct, KM_SLEEP);
1853		return (MDI_ACCEPT);
1854	} else {
1855		/*
1856		 * Synchronous failover mode.  Typically invoked from the user
1857		 * land.
1858		 */
1859		rv = i_mdi_failover(ct);
1860	}
1861	return (rv);
1862}
1863
1864/*
1865 * i_mdi_failover():
1866 *		internal failover function. Invokes vHCI drivers failover
1867 *		callback function and process the failover status
1868 * Return Values:
1869 *		None
1870 *
1871 * Note: A client device in failover state can not be detached or freed.
1872 */
1873static int
1874i_mdi_failover(void *arg)
1875{
1876	int		rv = MDI_SUCCESS;
1877	mdi_client_t	*ct = (mdi_client_t *)arg;
1878	mdi_vhci_t	*vh = ct->ct_vhci;
1879
1880	ASSERT(!MDI_CLIENT_LOCKED(ct));
1881
1882	if (vh->vh_ops->vo_failover != NULL) {
1883		/*
1884		 * Call vHCI drivers callback routine
1885		 */
1886		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1887		    ct->ct_failover_flags);
1888	}
1889
1890	MDI_CLIENT_LOCK(ct);
1891	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1892
1893	/*
1894	 * Save the failover return status
1895	 */
1896	ct->ct_failover_status = rv;
1897
1898	/*
1899	 * As a result of failover, client status would have been changed.
1900	 * Update the client state and wake up anyone waiting on this client
1901	 * device.
1902	 */
1903	i_mdi_client_update_state(ct);
1904
1905	cv_broadcast(&ct->ct_failover_cv);
1906	MDI_CLIENT_UNLOCK(ct);
1907	return (rv);
1908}
1909
1910/*
1911 * Load balancing is logical block.
1912 * IOs within the range described by region_size
1913 * would go on the same path. This would improve the
1914 * performance by cache-hit on some of the RAID devices.
1915 * Search only for online paths(At some point we
1916 * may want to balance across target ports).
1917 * If no paths are found then default to round-robin.
1918 */
1919static int
1920i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1921{
1922	int		path_index = -1;
1923	int		online_path_count = 0;
1924	int		online_nonpref_path_count = 0;
1925	int 		region_size = ct->ct_lb_args->region_size;
1926	mdi_pathinfo_t	*pip;
1927	mdi_pathinfo_t	*next;
1928	int		preferred, path_cnt;
1929
1930	pip = ct->ct_path_head;
1931	while (pip) {
1932		MDI_PI_LOCK(pip);
1933		if (MDI_PI(pip)->pi_state ==
1934		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1935			online_path_count++;
1936		} else if (MDI_PI(pip)->pi_state ==
1937		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1938			online_nonpref_path_count++;
1939		}
1940		next = (mdi_pathinfo_t *)
1941		    MDI_PI(pip)->pi_client_link;
1942		MDI_PI_UNLOCK(pip);
1943		pip = next;
1944	}
1945	/* if found any online/preferred then use this type */
1946	if (online_path_count > 0) {
1947		path_cnt = online_path_count;
1948		preferred = 1;
1949	} else if (online_nonpref_path_count > 0) {
1950		path_cnt = online_nonpref_path_count;
1951		preferred = 0;
1952	} else {
1953		path_cnt = 0;
1954	}
1955	if (path_cnt) {
1956		path_index = (bp->b_blkno >> region_size) % path_cnt;
1957		pip = ct->ct_path_head;
1958		while (pip && path_index != -1) {
1959			MDI_PI_LOCK(pip);
1960			if (path_index == 0 &&
1961			    (MDI_PI(pip)->pi_state ==
1962			    MDI_PATHINFO_STATE_ONLINE) &&
1963				MDI_PI(pip)->pi_preferred == preferred) {
1964				MDI_PI_HOLD(pip);
1965				MDI_PI_UNLOCK(pip);
1966				*ret_pip = pip;
1967				return (MDI_SUCCESS);
1968			}
1969			path_index --;
1970			next = (mdi_pathinfo_t *)
1971			    MDI_PI(pip)->pi_client_link;
1972			MDI_PI_UNLOCK(pip);
1973			pip = next;
1974		}
1975		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
1976		    "lba %llx: path %s %p",
1977		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
1978	}
1979	return (MDI_FAILURE);
1980}
1981
1982/*
1983 * mdi_select_path():
1984 *		select a path to access a client device.
1985 *
1986 *		mdi_select_path() function is called by the vHCI drivers to
1987 *		select a path to route the I/O request to.  The caller passes
1988 *		the block I/O data transfer structure ("buf") as one of the
1989 *		parameters.  The mpxio framework uses the buf structure
1990 *		contents to maintain per path statistics (total I/O size /
1991 *		count pending).  If more than one online paths are available to
1992 *		select, the framework automatically selects a suitable path
1993 *		for routing I/O request. If a failover operation is active for
1994 *		this client device the call shall be failed with MDI_BUSY error
1995 *		code.
1996 *
1997 *		By default this function returns a suitable path in online
1998 *		state based on the current load balancing policy.  Currently
1999 *		we support LOAD_BALANCE_NONE (Previously selected online path
2000 *		will continue to be used till the path is usable) and
2001 *		LOAD_BALANCE_RR (Online paths will be selected in a round
2002 *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
 *		based on the logical block).  The load balancing policy can
 *		be configured through the vHCI driver's configuration file
 *		(driver.conf).
2005 *
2006 *		vHCI drivers may override this default behavior by specifying
 *		appropriate flags.  The meaning of the third argument depends
2008 *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
2009 *		then the argument is the "path instance" of the path to select.
2010 *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
2011 *		"start_pip". A non NULL "start_pip" is the starting point to
2012 *		walk and find the next appropriate path.  The following values
2013 *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
2014 *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
2015 *		STANDBY path).
2016 *
2017 *		The non-standard behavior is used by the scsi_vhci driver,
2018 *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
2019 *		attach of client devices (to avoid an unnecessary failover
2020 *		when the STANDBY path comes up first), during failover
2021 *		(to activate a STANDBY path as ONLINE).
2022 *
 *		The selected path is returned in a mdi_hold_path() state
2024 *		(pi_ref_cnt). Caller should release the hold by calling
2025 *		mdi_rele_path().
2026 *
2027 * Return Values:
2028 *		MDI_SUCCESS	- Completed successfully
2029 *		MDI_BUSY 	- Client device is busy failing over
2030 *		MDI_NOPATH	- Client device is online, but no valid path are
2031 *				  available to access this client device
2032 *		MDI_FAILURE	- Invalid client device or state
2033 *		MDI_DEVI_ONLINING
2034 *				- Client device (struct dev_info state) is in
2035 *				  onlining state.
2036 */
2037
2038/*ARGSUSED*/
2039int
2040mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
2041    void *arg, mdi_pathinfo_t **ret_pip)
2042{
2043	mdi_client_t	*ct;
2044	mdi_pathinfo_t	*pip;
2045	mdi_pathinfo_t	*next;
2046	mdi_pathinfo_t	*head;
2047	mdi_pathinfo_t	*start;
2048	client_lb_t	lbp;	/* load balancing policy */
2049	int		sb = 1;	/* standard behavior */
2050	int		preferred = 1;	/* preferred path */
2051	int		cond, cont = 1;
2052	int		retry = 0;
2053	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
2054	int		path_instance;	/* request specific path instance */
2055
2056	/* determine type of arg based on flags */
2057	if (flags & MDI_SELECT_PATH_INSTANCE) {
2058		path_instance = (int)(intptr_t)arg;
2059		start_pip = NULL;
2060	} else {
2061		path_instance = 0;
2062		start_pip = (mdi_pathinfo_t *)arg;
2063	}
2064
2065	if (flags != 0) {
2066		/*
2067		 * disable default behavior
2068		 */
2069		sb = 0;
2070	}
2071
2072	*ret_pip = NULL;
2073	ct = i_devi_get_client(cdip);
2074	if (ct == NULL) {
2075		/* mdi extensions are NULL, Nothing more to do */
2076		return (MDI_FAILURE);
2077	}
2078
2079	MDI_CLIENT_LOCK(ct);
2080
2081	if (sb) {
2082		if (MDI_CLIENT_IS_FAILED(ct)) {
2083			/*
2084			 * Client is not ready to accept any I/O requests.
2085			 * Fail this request.
2086			 */
2087			MDI_DEBUG(2, (MDI_NOTE, cdip,
2088			    "client state offline ct = %p", (void *)ct));
2089			MDI_CLIENT_UNLOCK(ct);
2090			return (MDI_FAILURE);
2091		}
2092
2093		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
2094			/*
2095			 * Check for Failover is in progress. If so tell the
2096			 * caller that this device is busy.
2097			 */
2098			MDI_DEBUG(2, (MDI_NOTE, cdip,
2099			    "client failover in progress ct = %p",
2100			    (void *)ct));
2101			MDI_CLIENT_UNLOCK(ct);
2102			return (MDI_BUSY);
2103		}
2104
2105		/*
2106		 * Check to see whether the client device is attached.
2107		 * If not so, let the vHCI driver manually select a path
2108		 * (standby) and let the probe/attach process to continue.
2109		 */
2110		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2111			MDI_DEBUG(4, (MDI_NOTE, cdip,
2112			    "devi is onlining ct = %p", (void *)ct));
2113			MDI_CLIENT_UNLOCK(ct);
2114			return (MDI_DEVI_ONLINING);
2115		}
2116	}
2117
2118	/*
2119	 * Cache in the client list head.  If head of the list is NULL
2120	 * return MDI_NOPATH
2121	 */
2122	head = ct->ct_path_head;
2123	if (head == NULL) {
2124		MDI_CLIENT_UNLOCK(ct);
2125		return (MDI_NOPATH);
2126	}
2127
2128	/* Caller is specifying a specific pathinfo path by path_instance */
2129	if (path_instance) {
2130		/* search for pathinfo with correct path_instance */
2131		for (pip = head;
2132		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
2133		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
2134			;
2135
2136		/* If path can't be selected then MDI_NOPATH is returned. */
2137		if (pip == NULL) {
2138			MDI_CLIENT_UNLOCK(ct);
2139			return (MDI_NOPATH);
2140		}
2141
2142		/*
2143		 * Verify state of path. When asked to select a specific
2144		 * path_instance, we select the requested path in any
2145		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
2146		 * We don't however select paths where the pHCI has detached.
2147		 * NOTE: last pathinfo node of an opened client device may
2148		 * exist in an OFFLINE state after the pHCI associated with
2149		 * that path has detached (but pi_phci will be NULL if that
2150		 * has occurred).
2151		 */
2152		MDI_PI_LOCK(pip);
2153		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
2154		    (MDI_PI(pip)->pi_phci == NULL)) {
2155			MDI_PI_UNLOCK(pip);
2156			MDI_CLIENT_UNLOCK(ct);
2157			return (MDI_FAILURE);
2158		}
2159
2160		/* Return MDI_BUSY if we have a transient condition */
2161		if (MDI_PI_IS_TRANSIENT(pip)) {
2162			MDI_PI_UNLOCK(pip);
2163			MDI_CLIENT_UNLOCK(ct);
2164			return (MDI_BUSY);
2165		}
2166
2167		/*
2168		 * Return the path in hold state. Caller should release the
2169		 * lock by calling mdi_rele_path()
2170		 */
2171		MDI_PI_HOLD(pip);
2172		MDI_PI_UNLOCK(pip);
2173		*ret_pip = pip;
2174		MDI_CLIENT_UNLOCK(ct);
2175		return (MDI_SUCCESS);
2176	}
2177
2178	/*
2179	 * for non default behavior, bypass current
2180	 * load balancing policy and always use LOAD_BALANCE_RR
2181	 * except that the start point will be adjusted based
2182	 * on the provided start_pip
2183	 */
2184	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2185
2186	switch (lbp) {
2187	case LOAD_BALANCE_NONE:
2188		/*
2189		 * Load balancing is None  or Alternate path mode
2190		 * Start looking for a online mdi_pathinfo node starting from
2191		 * last known selected path
2192		 */
2193		preferred = 1;
2194		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2195		if (pip == NULL) {
2196			pip = head;
2197		}
2198		start = pip;
2199		do {
2200			MDI_PI_LOCK(pip);
2201			/*
2202			 * No need to explicitly check if the path is disabled.
2203			 * Since we are checking for state == ONLINE and the
2204			 * same variable is used for DISABLE/ENABLE information.
2205			 */
2206			if ((MDI_PI(pip)->pi_state  ==
2207				MDI_PATHINFO_STATE_ONLINE) &&
2208				preferred == MDI_PI(pip)->pi_preferred) {
2209				/*
2210				 * Return the path in hold state. Caller should
2211				 * release the lock by calling mdi_rele_path()
2212				 */
2213				MDI_PI_HOLD(pip);
2214				MDI_PI_UNLOCK(pip);
2215				ct->ct_path_last = pip;
2216				*ret_pip = pip;
2217				MDI_CLIENT_UNLOCK(ct);
2218				return (MDI_SUCCESS);
2219			}
2220
2221			/*
2222			 * Path is busy.
2223			 */
2224			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2225			    MDI_PI_IS_TRANSIENT(pip))
2226				retry = 1;
2227			/*
2228			 * Keep looking for a next available online path
2229			 */
2230			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2231			if (next == NULL) {
2232				next = head;
2233			}
2234			MDI_PI_UNLOCK(pip);
2235			pip = next;
2236			if (start == pip && preferred) {
2237				preferred = 0;
2238			} else if (start == pip && !preferred) {
2239				cont = 0;
2240			}
2241		} while (cont);
2242		break;
2243
2244	case LOAD_BALANCE_LBA:
2245		/*
2246		 * Make sure we are looking
2247		 * for an online path. Otherwise, if it is for a STANDBY
2248		 * path request, it will go through and fetch an ONLINE
2249		 * path which is not desirable.
2250		 */
2251		if ((ct->ct_lb_args != NULL) &&
2252			    (ct->ct_lb_args->region_size) && bp &&
2253				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2254			if (i_mdi_lba_lb(ct, ret_pip, bp)
2255				    == MDI_SUCCESS) {
2256				MDI_CLIENT_UNLOCK(ct);
2257				return (MDI_SUCCESS);
2258			}
2259		}
2260		/* FALLTHROUGH */
2261	case LOAD_BALANCE_RR:
2262		/*
2263		 * Load balancing is Round Robin. Start looking for a online
2264		 * mdi_pathinfo node starting from last known selected path
2265		 * as the start point.  If override flags are specified,
2266		 * process accordingly.
2267		 * If the search is already in effect(start_pip not null),
2268		 * then lets just use the same path preference to continue the
2269		 * traversal.
2270		 */
2271
2272		if (start_pip != NULL) {
2273			preferred = MDI_PI(start_pip)->pi_preferred;
2274		} else {
2275			preferred = 1;
2276		}
2277
2278		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2279		if (start == NULL) {
2280			pip = head;
2281		} else {
2282			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2283			if (pip == NULL) {
2284				if ( flags & MDI_SELECT_NO_PREFERRED) {
2285					/*
2286					 * Return since we hit the end of list
2287					 */
2288					MDI_CLIENT_UNLOCK(ct);
2289					return (MDI_NOPATH);
2290				}
2291
2292				if (!sb) {
2293					if (preferred == 0) {
2294						/*
2295						 * Looks like we have completed
2296						 * the traversal as preferred
2297						 * value is 0. Time to bail out.
2298						 */
2299						*ret_pip = NULL;
2300						MDI_CLIENT_UNLOCK(ct);
2301						return (MDI_NOPATH);
2302					} else {
2303						/*
2304						 * Looks like we reached the
2305						 * end of the list. Lets enable
2306						 * traversal of non preferred
2307						 * paths.
2308						 */
2309						preferred = 0;
2310					}
2311				}
2312				pip = head;
2313			}
2314		}
2315		start = pip;
2316		do {
2317			MDI_PI_LOCK(pip);
2318			if (sb) {
2319				cond = ((MDI_PI(pip)->pi_state ==
2320				    MDI_PATHINFO_STATE_ONLINE &&
2321					MDI_PI(pip)->pi_preferred ==
2322						preferred) ? 1 : 0);
2323			} else {
2324				if (flags == MDI_SELECT_ONLINE_PATH) {
2325					cond = ((MDI_PI(pip)->pi_state ==
2326					    MDI_PATHINFO_STATE_ONLINE &&
2327						MDI_PI(pip)->pi_preferred ==
2328						preferred) ? 1 : 0);
2329				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2330					cond = ((MDI_PI(pip)->pi_state ==
2331					    MDI_PATHINFO_STATE_STANDBY &&
2332						MDI_PI(pip)->pi_preferred ==
2333						preferred) ? 1 : 0);
2334				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2335				    MDI_SELECT_STANDBY_PATH)) {
2336					cond = (((MDI_PI(pip)->pi_state ==
2337					    MDI_PATHINFO_STATE_ONLINE ||
2338					    (MDI_PI(pip)->pi_state ==
2339					    MDI_PATHINFO_STATE_STANDBY)) &&
2340						MDI_PI(pip)->pi_preferred ==
2341						preferred) ? 1 : 0);
2342				} else if (flags ==
2343					(MDI_SELECT_STANDBY_PATH |
2344					MDI_SELECT_ONLINE_PATH |
2345					MDI_SELECT_USER_DISABLE_PATH)) {
2346					cond = (((MDI_PI(pip)->pi_state ==
2347					    MDI_PATHINFO_STATE_ONLINE ||
2348					    (MDI_PI(pip)->pi_state ==
2349					    MDI_PATHINFO_STATE_STANDBY) ||
2350						(MDI_PI(pip)->pi_state ==
2351					    (MDI_PATHINFO_STATE_ONLINE|
2352					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2353						(MDI_PI(pip)->pi_state ==
2354					    (MDI_PATHINFO_STATE_STANDBY |
2355					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2356						MDI_PI(pip)->pi_preferred ==
2357						preferred) ? 1 : 0);
2358				} else if (flags ==
2359				    (MDI_SELECT_STANDBY_PATH |
2360				    MDI_SELECT_ONLINE_PATH |
2361				    MDI_SELECT_NO_PREFERRED)) {
2362					cond = (((MDI_PI(pip)->pi_state ==
2363					    MDI_PATHINFO_STATE_ONLINE) ||
2364					    (MDI_PI(pip)->pi_state ==
2365					    MDI_PATHINFO_STATE_STANDBY))
2366					    ? 1 : 0);
2367				} else {
2368					cond = 0;
2369				}
2370			}
2371			/*
2372			 * No need to explicitly check if the path is disabled.
2373			 * Since we are checking for state == ONLINE and the
2374			 * same variable is used for DISABLE/ENABLE information.
2375			 */
2376			if (cond) {
2377				/*
2378				 * Return the path in hold state. Caller should
2379				 * release the lock by calling mdi_rele_path()
2380				 */
2381				MDI_PI_HOLD(pip);
2382				MDI_PI_UNLOCK(pip);
2383				if (sb)
2384					ct->ct_path_last = pip;
2385				*ret_pip = pip;
2386				MDI_CLIENT_UNLOCK(ct);
2387				return (MDI_SUCCESS);
2388			}
2389			/*
2390			 * Path is busy.
2391			 */
2392			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2393			    MDI_PI_IS_TRANSIENT(pip))
2394				retry = 1;
2395
2396			/*
2397			 * Keep looking for a next available online path
2398			 */
2399do_again:
2400			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2401			if (next == NULL) {
2402				if ( flags & MDI_SELECT_NO_PREFERRED) {
2403					/*
2404					 * Bail out since we hit the end of list
2405					 */
2406					MDI_PI_UNLOCK(pip);
2407					break;
2408				}
2409
2410				if (!sb) {
2411					if (preferred == 1) {
2412						/*
2413						 * Looks like we reached the
2414						 * end of the list. Lets enable
2415						 * traversal of non preferred
2416						 * paths.
2417						 */
2418						preferred = 0;
2419						next = head;
2420					} else {
2421						/*
2422						 * We have done both the passes
2423						 * Preferred as well as for
2424						 * Non-preferred. Bail out now.
2425						 */
2426						cont = 0;
2427					}
2428				} else {
2429					/*
2430					 * Standard behavior case.
2431					 */
2432					next = head;
2433				}
2434			}
2435			MDI_PI_UNLOCK(pip);
2436			if (cont == 0) {
2437				break;
2438			}
2439			pip = next;
2440
2441			if (!sb) {
2442				/*
2443				 * We need to handle the selection of
2444				 * non-preferred path in the following
2445				 * case:
2446				 *
2447				 * +------+   +------+   +------+   +-----+
2448				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2449				 * +------+   +------+   +------+   +-----+
2450				 *
2451				 * If we start the search with B, we need to
2452				 * skip beyond B to pick C which is non -
2453				 * preferred in the second pass. The following
2454				 * test, if true, will allow us to skip over
2455				 * the 'start'(B in the example) to select
2456				 * other non preferred elements.
2457				 */
2458				if ((start_pip != NULL) && (start_pip == pip) &&
2459				    (MDI_PI(start_pip)->pi_preferred
2460				    != preferred)) {
2461					/*
2462					 * try again after going past the start
2463					 * pip
2464					 */
2465					MDI_PI_LOCK(pip);
2466					goto do_again;
2467				}
2468			} else {
2469				/*
2470				 * Standard behavior case
2471				 */
2472				if (start == pip && preferred) {
2473					/* look for nonpreferred paths */
2474					preferred = 0;
2475				} else if (start == pip && !preferred) {
2476					/*
2477					 * Exit condition
2478					 */
2479					cont = 0;
2480				}
2481			}
2482		} while (cont);
2483		break;
2484	}
2485
2486	MDI_CLIENT_UNLOCK(ct);
2487	if (retry == 1) {
2488		return (MDI_BUSY);
2489	} else {
2490		return (MDI_NOPATH);
2491	}
2492}
2493
2494/*
2495 * For a client, return the next available path to any phci
2496 *
2497 * Note:
2498 *		Caller should hold the branch's devinfo node to get a consistent
2499 *		snap shot of the mdi_pathinfo nodes.
2500 *
2501 *		Please note that even the list is stable the mdi_pathinfo
2502 *		node state and properties are volatile.  The caller should lock
2503 *		and unlock the nodes by calling mdi_pi_lock() and
2504 *		mdi_pi_unlock() functions to get a stable properties.
2505 *
2506 *		If there is a need to use the nodes beyond the hold of the
2507 *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2508 *		need to be held against unexpected removal by calling
2509 *		mdi_hold_path() and should be released by calling
2510 *		mdi_rele_path() on completion.
2511 */
2512mdi_pathinfo_t *
2513mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2514{
2515	mdi_client_t *ct;
2516
2517	if (!MDI_CLIENT(ct_dip))
2518		return (NULL);
2519
2520	/*
2521	 * Walk through client link
2522	 */
2523	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2524	ASSERT(ct != NULL);
2525
2526	if (pip == NULL)
2527		return ((mdi_pathinfo_t *)ct->ct_path_head);
2528
2529	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2530}
2531
2532/*
2533 * For a phci, return the next available path to any client
2534 * Note: ditto mdi_get_next_phci_path()
2535 */
2536mdi_pathinfo_t *
2537mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2538{
2539	mdi_phci_t *ph;
2540
2541	if (!MDI_PHCI(ph_dip))
2542		return (NULL);
2543
2544	/*
2545	 * Walk through pHCI link
2546	 */
2547	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2548	ASSERT(ph != NULL);
2549
2550	if (pip == NULL)
2551		return ((mdi_pathinfo_t *)ph->ph_path_head);
2552
2553	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2554}
2555
2556/*
2557 * mdi_hold_path():
2558 *		Hold the mdi_pathinfo node against unwanted unexpected free.
2559 * Return Values:
2560 *		None
2561 */
2562void
2563mdi_hold_path(mdi_pathinfo_t *pip)
2564{
2565	if (pip) {
2566		MDI_PI_LOCK(pip);
2567		MDI_PI_HOLD(pip);
2568		MDI_PI_UNLOCK(pip);
2569	}
2570}
2571
2572
2573/*
2574 * mdi_rele_path():
2575 *		Release the mdi_pathinfo node which was selected
2576 *		through mdi_select_path() mechanism or manually held by
2577 *		calling mdi_hold_path().
2578 * Return Values:
2579 *		None
2580 */
2581void
2582mdi_rele_path(mdi_pathinfo_t *pip)
2583{
2584	if (pip) {
2585		MDI_PI_LOCK(pip);
2586		MDI_PI_RELE(pip);
2587		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2588			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2589		}
2590		MDI_PI_UNLOCK(pip);
2591	}
2592}
2593
2594/*
2595 * mdi_pi_lock():
2596 * 		Lock the mdi_pathinfo node.
2597 * Note:
2598 *		The caller should release the lock by calling mdi_pi_unlock()
2599 */
2600void
2601mdi_pi_lock(mdi_pathinfo_t *pip)
2602{
2603	ASSERT(pip != NULL);
2604	if (pip) {
2605		MDI_PI_LOCK(pip);
2606	}
2607}
2608
2609
2610/*
2611 * mdi_pi_unlock():
2612 * 		Unlock the mdi_pathinfo node.
2613 * Note:
2614 *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2615 */
2616void
2617mdi_pi_unlock(mdi_pathinfo_t *pip)
2618{
2619	ASSERT(pip != NULL);
2620	if (pip) {
2621		MDI_PI_UNLOCK(pip);
2622	}
2623}
2624
2625/*
2626 * mdi_pi_find():
2627 *		Search the list of mdi_pathinfo nodes attached to the
2628 *		pHCI/Client device node whose path address matches "paddr".
2629 *		Returns a pointer to the mdi_pathinfo node if a matching node is
2630 *		found.
2631 * Return Values:
2632 *		mdi_pathinfo node handle
2633 *		NULL
2634 * Notes:
2635 *		Caller need not hold any locks to call this function.
2636 */
2637mdi_pathinfo_t *
2638mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2639{
2640	mdi_phci_t		*ph;
2641	mdi_vhci_t		*vh;
2642	mdi_client_t		*ct;
2643	mdi_pathinfo_t		*pip = NULL;
2644
2645	MDI_DEBUG(2, (MDI_NOTE, pdip,
2646	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
2647	if ((pdip == NULL) || (paddr == NULL)) {
2648		return (NULL);
2649	}
2650	ph = i_devi_get_phci(pdip);
2651	if (ph == NULL) {
2652		/*
2653		 * Invalid pHCI device, Nothing more to do.
2654		 */
2655		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
2656		return (NULL);
2657	}
2658
2659	vh = ph->ph_vhci;
2660	if (vh == NULL) {
2661		/*
2662		 * Invalid vHCI device, Nothing more to do.
2663		 */
2664		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
2665		return (NULL);
2666	}
2667
2668	/*
2669	 * Look for pathinfo node identified by paddr.
2670	 */
2671	if (caddr == NULL) {
2672		/*
2673		 * Find a mdi_pathinfo node under pHCI list for a matching
2674		 * unit address.
2675		 */
2676		MDI_PHCI_LOCK(ph);
2677		if (MDI_PHCI_IS_OFFLINE(ph)) {
2678			MDI_DEBUG(2, (MDI_WARN, pdip,
2679			    "offline phci %p", (void *)ph));
2680			MDI_PHCI_UNLOCK(ph);
2681			return (NULL);
2682		}
2683		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2684
2685		while (pip != NULL) {
2686			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2687				break;
2688			}
2689			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2690		}
2691		MDI_PHCI_UNLOCK(ph);
2692		MDI_DEBUG(2, (MDI_NOTE, pdip,
2693		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
2694		return (pip);
2695	}
2696
2697	/*
2698	 * XXX - Is the rest of the code in this function really necessary?
2699	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2700	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2701	 * whether the search is based on the pathinfo nodes attached to
2702	 * the pHCI or the client node, the result will be the same.
2703	 */
2704
2705	/*
2706	 * Find the client device corresponding to 'caddr'
2707	 */
2708	MDI_VHCI_CLIENT_LOCK(vh);
2709
2710	/*
2711	 * XXX - Passing NULL to the following function works as long as the
2712	 * the client addresses (caddr) are unique per vhci basis.
2713	 */
2714	ct = i_mdi_client_find(vh, NULL, caddr);
2715	if (ct == NULL) {
2716		/*
2717		 * Client not found, Obviously mdi_pathinfo node has not been
2718		 * created yet.
2719		 */
2720		MDI_VHCI_CLIENT_UNLOCK(vh);
2721		MDI_DEBUG(2, (MDI_NOTE, pdip,
2722		    "client not found for caddr @%s", caddr ? caddr : ""));
2723		return (NULL);
2724	}
2725
2726	/*
2727	 * Hold the client lock and look for a mdi_pathinfo node with matching
2728	 * pHCI and paddr
2729	 */
2730	MDI_CLIENT_LOCK(ct);
2731
2732	/*
2733	 * Release the global mutex as it is no more needed. Note: We always
2734	 * respect the locking order while acquiring.
2735	 */
2736	MDI_VHCI_CLIENT_UNLOCK(vh);
2737
2738	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2739	while (pip != NULL) {
2740		/*
2741		 * Compare the unit address
2742		 */
2743		if ((MDI_PI(pip)->pi_phci == ph) &&
2744		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2745			break;
2746		}
2747		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2748	}
2749	MDI_CLIENT_UNLOCK(ct);
2750	MDI_DEBUG(2, (MDI_NOTE, pdip,
2751	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
2752	return (pip);
2753}
2754
2755/*
2756 * mdi_pi_alloc():
2757 *		Allocate and initialize a new instance of a mdi_pathinfo node.
2758 *		The mdi_pathinfo node returned by this function identifies a
2759 *		unique device path is capable of having properties attached
2760 *		and passed to mdi_pi_online() to fully attach and online the
2761 *		path and client device node.
2762 *		The mdi_pathinfo node returned by this function must be
2763 *		destroyed using mdi_pi_free() if the path is no longer
2764 *		operational or if the caller fails to attach a client device
2765 *		node when calling mdi_pi_online(). The framework will not free
2766 *		the resources allocated.
2767 *		This function can be called from both interrupt and kernel
2768 *		contexts.  DDI_NOSLEEP flag should be used while calling
2769 *		from interrupt contexts.
2770 * Return Values:
2771 *		MDI_SUCCESS
2772 *		MDI_FAILURE
2773 *		MDI_NOMEM
2774 */
2775/*ARGSUSED*/
2776int
2777mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2778    char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2779{
2780	mdi_vhci_t	*vh;
2781	mdi_phci_t	*ph;
2782	mdi_client_t	*ct;
2783	mdi_pathinfo_t	*pip = NULL;
2784	dev_info_t	*cdip;
2785	int		rv = MDI_NOMEM;
2786	int		path_allocated = 0;
2787
2788	MDI_DEBUG(2, (MDI_NOTE, pdip,
2789	    "cname %s: caddr@%s paddr@%s",
2790	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));
2791
2792	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2793	    ret_pip == NULL) {
2794		/* Nothing more to do */
2795		return (MDI_FAILURE);
2796	}
2797
2798	*ret_pip = NULL;
2799
2800	/* No allocations on detaching pHCI */
2801	if (DEVI_IS_DETACHING(pdip)) {
2802		/* Invalid pHCI device, return failure */
2803		MDI_DEBUG(1, (MDI_WARN, pdip,
2804		    "!detaching pHCI=%p", (void *)pdip));
2805		return (MDI_FAILURE);
2806	}
2807
2808	ph = i_devi_get_phci(pdip);
2809	ASSERT(ph != NULL);
2810	if (ph == NULL) {
2811		/* Invalid pHCI device, return failure */
2812		MDI_DEBUG(1, (MDI_WARN, pdip,
2813		    "!invalid pHCI=%p", (void *)pdip));
2814		return (MDI_FAILURE);
2815	}
2816
2817	MDI_PHCI_LOCK(ph);
2818	vh = ph->ph_vhci;
2819	if (vh == NULL) {
2820		/* Invalid vHCI device, return failure */
2821		MDI_DEBUG(1, (MDI_WARN, pdip,
2822		    "!invalid vHCI=%p", (void *)pdip));
2823		MDI_PHCI_UNLOCK(ph);
2824		return (MDI_FAILURE);
2825	}
2826
2827	if (MDI_PHCI_IS_READY(ph) == 0) {
2828		/*
2829		 * Do not allow new node creation when pHCI is in
2830		 * offline/suspended states
2831		 */
2832		MDI_DEBUG(1, (MDI_WARN, pdip,
2833		    "pHCI=%p is not ready", (void *)ph));
2834		MDI_PHCI_UNLOCK(ph);
2835		return (MDI_BUSY);
2836	}
2837	MDI_PHCI_UNSTABLE(ph);
2838	MDI_PHCI_UNLOCK(ph);
2839
2840	/* look for a matching client, create one if not found */
2841	MDI_VHCI_CLIENT_LOCK(vh);
2842	ct = i_mdi_client_find(vh, cname, caddr);
2843	if (ct == NULL) {
2844		ct = i_mdi_client_alloc(vh, cname, caddr);
2845		ASSERT(ct != NULL);
2846	}
2847
2848	if (ct->ct_dip == NULL) {
2849		/*
2850		 * Allocate a devinfo node
2851		 */
2852		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2853		    compatible, ncompatible);
2854		if (ct->ct_dip == NULL) {
2855			(void) i_mdi_client_free(vh, ct);
2856			goto fail;
2857		}
2858	}
2859	cdip = ct->ct_dip;
2860
2861	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2862	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2863
2864	MDI_CLIENT_LOCK(ct);
2865	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2866	while (pip != NULL) {
2867		/*
2868		 * Compare the unit address
2869		 */
2870		if ((MDI_PI(pip)->pi_phci == ph) &&
2871		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2872			break;
2873		}
2874		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2875	}
2876	MDI_CLIENT_UNLOCK(ct);
2877
2878	if (pip == NULL) {
2879		/*
2880		 * This is a new path for this client device.  Allocate and
2881		 * initialize a new pathinfo node
2882		 */
2883		pip = i_mdi_pi_alloc(ph, paddr, ct);
2884		ASSERT(pip != NULL);
2885		path_allocated = 1;
2886	}
2887	rv = MDI_SUCCESS;
2888
2889fail:
2890	/*
2891	 * Release the global mutex.
2892	 */
2893	MDI_VHCI_CLIENT_UNLOCK(vh);
2894
2895	/*
2896	 * Mark the pHCI as stable
2897	 */
2898	MDI_PHCI_LOCK(ph);
2899	MDI_PHCI_STABLE(ph);
2900	MDI_PHCI_UNLOCK(ph);
2901	*ret_pip = pip;
2902
2903	MDI_DEBUG(2, (MDI_NOTE, pdip,
2904	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));
2905
2906	if (path_allocated)
2907		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2908
2909	return (rv);
2910}
2911
2912/*ARGSUSED*/
2913int
2914mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2915    int flags, mdi_pathinfo_t **ret_pip)
2916{
2917	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2918	    flags, ret_pip));
2919}
2920
2921/*
2922 * i_mdi_pi_alloc():
2923 *		Allocate a mdi_pathinfo node and add to the pHCI path list
2924 * Return Values:
2925 *		mdi_pathinfo
2926 */
2927/*ARGSUSED*/
2928static mdi_pathinfo_t *
2929i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2930{
2931	mdi_pathinfo_t	*pip;
2932	int		ct_circular;
2933	int		ph_circular;
2934	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
2935	char		*path_persistent;
2936	int		path_instance;
2937	mod_hash_val_t	hv;
2938
2939	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2940
2941	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2942	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2943	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2944	    MDI_PATHINFO_STATE_TRANSIENT;
2945
2946	if (MDI_PHCI_IS_USER_DISABLED(ph))
2947		MDI_PI_SET_USER_DISABLE(pip);
2948
2949	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2950		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2951
2952	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2953		MDI_PI_SET_DRV_DISABLE(pip);
2954
2955	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2956	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2957	MDI_PI(pip)->pi_client = ct;
2958	MDI_PI(pip)->pi_phci = ph;
2959	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2960	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2961
2962        /*
2963	 * We form the "path" to the pathinfo node, and see if we have
2964	 * already allocated a 'path_instance' for that "path".  If so,
2965	 * we use the already allocated 'path_instance'.  If not, we
2966	 * allocate a new 'path_instance' and associate it with a copy of
2967	 * the "path" string (which is never freed). The association
2968	 * between a 'path_instance' this "path" string persists until
2969	 * reboot.
2970	 */
2971        mutex_enter(&mdi_pathmap_mutex);
2972	(void) ddi_pathname(ph->ph_dip, path);
2973	(void) sprintf(path + strlen(path), "/%s@%s",
2974	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2975        if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2976                path_instance = (uint_t)(intptr_t)hv;
2977        } else {
2978		/* allocate a new 'path_instance' and persistent "path" */
2979		path_instance = mdi_pathmap_instance++;
2980		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2981                (void) mod_hash_insert(mdi_pathmap_bypath,
2982                    (mod_hash_key_t)path_persistent,
2983                    (mod_hash_val_t)(intptr_t)path_instance);
2984		(void) mod_hash_insert(mdi_pathmap_byinstance,
2985		    (mod_hash_key_t)(intptr_t)path_instance,
2986		    (mod_hash_val_t)path_persistent);
2987
2988		/* create shortpath name */
2989		(void) snprintf(path, sizeof(path), "%s%d/%s@%s",
2990		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
2991		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2992		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2993		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
2994		    (mod_hash_key_t)(intptr_t)path_instance,
2995		    (mod_hash_val_t)path_persistent);
2996        }
2997        mutex_exit(&mdi_pathmap_mutex);
2998	MDI_PI(pip)->pi_path_instance = path_instance;
2999
3000	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
3001	ASSERT(MDI_PI(pip)->pi_prop != NULL);
3002	MDI_PI(pip)->pi_pprivate = NULL;
3003	MDI_PI(pip)->pi_cprivate = NULL;
3004	MDI_PI(pip)->pi_vprivate = NULL;
3005	MDI_PI(pip)->pi_client_link = NULL;
3006	MDI_PI(pip)->pi_phci_link = NULL;
3007	MDI_PI(pip)->pi_ref_cnt = 0;
3008	MDI_PI(pip)->pi_kstats = NULL;
3009	MDI_PI(pip)->pi_preferred = 1;
3010	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
3011
3012	/*
3013	 * Lock both dev_info nodes against changes in parallel.
3014	 *
3015	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
3016	 * This atypical operation is done to synchronize pathinfo nodes
3017	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
3018	 * the pathinfo nodes are children of the Client.
3019	 */
3020	ndi_devi_enter(ct->ct_dip, &ct_circular);
3021	ndi_devi_enter(ph->ph_dip, &ph_circular);
3022
3023	i_mdi_phci_add_path(ph, pip);
3024	i_mdi_client_add_path(ct, pip);
3025
3026	ndi_devi_exit(ph->ph_dip, ph_circular);
3027	ndi_devi_exit(ct->ct_dip, ct_circular);
3028
3029	return (pip);
3030}
3031
3032/*
3033 * mdi_pi_pathname_by_instance():
3034 *	Lookup of "path" by 'path_instance'. Return "path".
3035 *	NOTE: returned "path" remains valid forever (until reboot).
3036 */
3037char *
3038mdi_pi_pathname_by_instance(int path_instance)
3039{
3040	char		*path;
3041	mod_hash_val_t	hv;
3042
3043	/* mdi_pathmap lookup of "path" by 'path_instance' */
3044	mutex_enter(&mdi_pathmap_mutex);
3045	if (mod_hash_find(mdi_pathmap_byinstance,
3046	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3047		path = (char *)hv;
3048	else
3049		path = NULL;
3050	mutex_exit(&mdi_pathmap_mutex);
3051	return (path);
3052}
3053
3054/*
3055 * mdi_pi_spathname_by_instance():
3056 *	Lookup of "shortpath" by 'path_instance'. Return "shortpath".
3057 *	NOTE: returned "shortpath" remains valid forever (until reboot).
3058 */
3059char *
3060mdi_pi_spathname_by_instance(int path_instance)
3061{
3062	char		*path;
3063	mod_hash_val_t	hv;
3064
3065	/* mdi_pathmap lookup of "path" by 'path_instance' */
3066	mutex_enter(&mdi_pathmap_mutex);
3067	if (mod_hash_find(mdi_pathmap_sbyinstance,
3068	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3069		path = (char *)hv;
3070	else
3071		path = NULL;
3072	mutex_exit(&mdi_pathmap_mutex);
3073	return (path);
3074}
3075
3076
3077/*
3078 * i_mdi_phci_add_path():
3079 * 		Add a mdi_pathinfo node to pHCI list.
3080 * Notes:
3081 *		Caller should per-pHCI mutex
3082 */
3083static void
3084i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3085{
3086	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3087
3088	MDI_PHCI_LOCK(ph);
3089	if (ph->ph_path_head == NULL) {
3090		ph->ph_path_head = pip;
3091	} else {
3092		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
3093	}
3094	ph->ph_path_tail = pip;
3095	ph->ph_path_count++;
3096	MDI_PHCI_UNLOCK(ph);
3097}
3098
3099/*
3100 * i_mdi_client_add_path():
3101 *		Add mdi_pathinfo node to client list
3102 */
3103static void
3104i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3105{
3106	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3107
3108	MDI_CLIENT_LOCK(ct);
3109	if (ct->ct_path_head == NULL) {
3110		ct->ct_path_head = pip;
3111	} else {
3112		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3113	}
3114	ct->ct_path_tail = pip;
3115	ct->ct_path_count++;
3116	MDI_CLIENT_UNLOCK(ct);
3117}
3118
3119/*
3120 * mdi_pi_free():
3121 *		Free the mdi_pathinfo node and also client device node if this
3122 *		is the last path to the device
3123 * Return Values:
3124 *		MDI_SUCCESS
3125 *		MDI_FAILURE
3126 *		MDI_BUSY
3127 */
3128/*ARGSUSED*/
3129int
3130mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3131{
3132	int		rv;
3133	mdi_vhci_t	*vh;
3134	mdi_phci_t	*ph;
3135	mdi_client_t	*ct;
3136	int		(*f)();
3137	int		client_held = 0;
3138
3139	MDI_PI_LOCK(pip);
3140	ph = MDI_PI(pip)->pi_phci;
3141	ASSERT(ph != NULL);
3142	if (ph == NULL) {
3143		/*
3144		 * Invalid pHCI device, return failure
3145		 */
3146		MDI_DEBUG(1, (MDI_WARN, NULL,
3147		    "!invalid pHCI: pip %s %p",
3148		    mdi_pi_spathname(pip), (void *)pip));
3149		MDI_PI_UNLOCK(pip);
3150		return (MDI_FAILURE);
3151	}
3152
3153	vh = ph->ph_vhci;
3154	ASSERT(vh != NULL);
3155	if (vh == NULL) {
3156		/* Invalid pHCI device, return failure */
3157		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3158		    "!invalid vHCI: pip %s %p",
3159		    mdi_pi_spathname(pip), (void *)pip));
3160		MDI_PI_UNLOCK(pip);
3161		return (MDI_FAILURE);
3162	}
3163
3164	ct = MDI_PI(pip)->pi_client;
3165	ASSERT(ct != NULL);
3166	if (ct == NULL) {
3167		/*
3168		 * Invalid Client device, return failure
3169		 */
3170		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3171		    "!invalid client: pip %s %p",
3172		    mdi_pi_spathname(pip), (void *)pip));
3173		MDI_PI_UNLOCK(pip);
3174		return (MDI_FAILURE);
3175	}
3176
3177	/*
3178	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3179	 * if the node state is either offline or init and the reference count
3180	 * is zero.
3181	 */
3182	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3183	    MDI_PI_IS_INITING(pip))) {
3184		/*
3185		 * Node is busy
3186		 */
3187		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3188		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
3189		MDI_PI_UNLOCK(pip);
3190		return (MDI_BUSY);
3191	}
3192
3193	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3194		/*
3195		 * Give a chance for pending I/Os to complete.
3196		 */
3197		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3198		    "!%d cmds still pending on path: %s %p",
3199		    MDI_PI(pip)->pi_ref_cnt,
3200		    mdi_pi_spathname(pip), (void *)pip));
3201		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
3202		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
3203		    TR_CLOCK_TICK) == -1) {
3204			/*
3205			 * The timeout time reached without ref_cnt being zero
3206			 * being signaled.
3207			 */
3208			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3209			    "!Timeout reached on path %s %p without the cond",
3210			    mdi_pi_spathname(pip), (void *)pip));
3211			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3212			    "!%d cmds still pending on path %s %p",
3213			    MDI_PI(pip)->pi_ref_cnt,
3214			    mdi_pi_spathname(pip), (void *)pip));
3215			MDI_PI_UNLOCK(pip);
3216			return (MDI_BUSY);
3217		}
3218	}
3219	if (MDI_PI(pip)->pi_pm_held) {
3220		client_held = 1;
3221	}
3222	MDI_PI_UNLOCK(pip);
3223
3224	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3225
3226	MDI_CLIENT_LOCK(ct);
3227
3228	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3229	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3230
3231	/*
3232	 * Wait till failover is complete before removing this node.
3233	 */
3234	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3235		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3236
3237	MDI_CLIENT_UNLOCK(ct);
3238	MDI_VHCI_CLIENT_LOCK(vh);
3239	MDI_CLIENT_LOCK(ct);
3240	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3241
3242	if (!MDI_PI_IS_INITING(pip)) {
3243		f = vh->vh_ops->vo_pi_uninit;
3244		if (f != NULL) {
3245			rv = (*f)(vh->vh_dip, pip, 0);
3246		}
3247	} else
3248		rv = MDI_SUCCESS;
3249
3250	/*
3251	 * If vo_pi_uninit() completed successfully.
3252	 */
3253	if (rv == MDI_SUCCESS) {
3254		if (client_held) {
3255			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3256			    "i_mdi_pm_rele_client\n"));
3257			i_mdi_pm_rele_client(ct, 1);
3258		}
3259		i_mdi_pi_free(ph, pip, ct);
3260		if (ct->ct_path_count == 0) {
3261			/*
3262			 * Client lost its last path.
3263			 * Clean up the client device
3264			 */
3265			MDI_CLIENT_UNLOCK(ct);
3266			(void) i_mdi_client_free(ct->ct_vhci, ct);
3267			MDI_VHCI_CLIENT_UNLOCK(vh);
3268			return (rv);
3269		}
3270	}
3271	MDI_CLIENT_UNLOCK(ct);
3272	MDI_VHCI_CLIENT_UNLOCK(vh);
3273
3274	if (rv == MDI_FAILURE)
3275		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3276
3277	return (rv);
3278}
3279
3280/*
3281 * i_mdi_pi_free():
3282 *		Free the mdi_pathinfo node
3283 */
3284static void
3285i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3286{
3287	int	ct_circular;
3288	int	ph_circular;
3289
3290	ASSERT(MDI_CLIENT_LOCKED(ct));
3291
3292	/*
3293	 * remove any per-path kstats
3294	 */
3295	i_mdi_pi_kstat_destroy(pip);
3296
3297	/* See comments in i_mdi_pi_alloc() */
3298	ndi_devi_enter(ct->ct_dip, &ct_circular);
3299	ndi_devi_enter(ph->ph_dip, &ph_circular);
3300
3301	i_mdi_client_remove_path(ct, pip);
3302	i_mdi_phci_remove_path(ph, pip);
3303
3304	ndi_devi_exit(ph->ph_dip, ph_circular);
3305	ndi_devi_exit(ct->ct_dip, ct_circular);
3306
3307	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3308	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3309	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3310	if (MDI_PI(pip)->pi_addr) {
3311		kmem_free(MDI_PI(pip)->pi_addr,
3312		    strlen(MDI_PI(pip)->pi_addr) + 1);
3313		MDI_PI(pip)->pi_addr = NULL;
3314	}
3315
3316	if (MDI_PI(pip)->pi_prop) {
3317		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3318		MDI_PI(pip)->pi_prop = NULL;
3319	}
3320	kmem_free(pip, sizeof (struct mdi_pathinfo));
3321}
3322
3323
3324/*
3325 * i_mdi_phci_remove_path():
3326 * 		Remove a mdi_pathinfo node from pHCI list.
3327 * Notes:
3328 *		Caller should hold per-pHCI mutex
3329 */
3330static void
3331i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3332{
3333	mdi_pathinfo_t	*prev = NULL;
3334	mdi_pathinfo_t	*path = NULL;
3335
3336	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3337
3338	MDI_PHCI_LOCK(ph);
3339	path = ph->ph_path_head;
3340	while (path != NULL) {
3341		if (path == pip) {
3342			break;
3343		}
3344		prev = path;
3345		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3346	}
3347
3348	if (path) {
3349		ph->ph_path_count--;
3350		if (prev) {
3351			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3352		} else {
3353			ph->ph_path_head =
3354			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3355		}
3356		if (ph->ph_path_tail == path) {
3357			ph->ph_path_tail = prev;
3358		}
3359	}
3360
3361	/*
3362	 * Clear the pHCI link
3363	 */
3364	MDI_PI(pip)->pi_phci_link = NULL;
3365	MDI_PI(pip)->pi_phci = NULL;
3366	MDI_PHCI_UNLOCK(ph);
3367}
3368
3369/*
3370 * i_mdi_client_remove_path():
3371 * 		Remove a mdi_pathinfo node from client path list.
3372 */
3373static void
3374i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3375{
3376	mdi_pathinfo_t	*prev = NULL;
3377	mdi_pathinfo_t	*path;
3378
3379	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3380
3381	ASSERT(MDI_CLIENT_LOCKED(ct));
3382	path = ct->ct_path_head;
3383	while (path != NULL) {
3384		if (path == pip) {
3385			break;
3386		}
3387		prev = path;
3388		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3389	}
3390
3391	if (path) {
3392		ct->ct_path_count--;
3393		if (prev) {
3394			MDI_PI(prev)->pi_client_link =
3395			    MDI_PI(path)->pi_client_link;
3396		} else {
3397			ct->ct_path_head =
3398			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3399		}
3400		if (ct->ct_path_tail == path) {
3401			ct->ct_path_tail = prev;
3402		}
3403		if (ct->ct_path_last == path) {
3404			ct->ct_path_last = ct->ct_path_head;
3405		}
3406	}
3407	MDI_PI(pip)->pi_client_link = NULL;
3408	MDI_PI(pip)->pi_client = NULL;
3409}
3410
3411/*
3412 * i_mdi_pi_state_change():
3413 *		online a mdi_pathinfo node
3414 *
3415 * Return Values:
3416 *		MDI_SUCCESS
3417 *		MDI_FAILURE
3418 */
3419/*ARGSUSED*/
3420static int
3421i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
3422{
3423	int		rv = MDI_SUCCESS;
3424	mdi_vhci_t	*vh;
3425	mdi_phci_t	*ph;
3426	mdi_client_t	*ct;
3427	int		(*f)();
3428	dev_info_t	*cdip;
3429
3430	MDI_PI_LOCK(pip);
3431
3432	ph = MDI_PI(pip)->pi_phci;
3433	ASSERT(ph);
3434	if (ph == NULL) {
3435		/*
3436		 * Invalid pHCI device, fail the request
3437		 */
3438		MDI_PI_UNLOCK(pip);
3439		MDI_DEBUG(1, (MDI_WARN, NULL,
3440		    "!invalid phci: pip %s %p",
3441		    mdi_pi_spathname(pip), (void *)pip));
3442		return (MDI_FAILURE);
3443	}
3444
3445	vh = ph->ph_vhci;
3446	ASSERT(vh);
3447	if (vh == NULL) {
3448		/*
3449		 * Invalid vHCI device, fail the request
3450		 */
3451		MDI_PI_UNLOCK(pip);
3452		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3453		    "!invalid vhci: pip %s %p",
3454		    mdi_pi_spathname(pip), (void *)pip));
3455		return (MDI_FAILURE);
3456	}
3457
3458	ct = MDI_PI(pip)->pi_client;
3459	ASSERT(ct != NULL);
3460	if (ct == NULL) {
3461		/*
3462		 * Invalid client device, fail the request
3463		 */
3464		MDI_PI_UNLOCK(pip);
3465		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3466		    "!invalid client: pip %s %p",
3467		    mdi_pi_spathname(pip), (void *)pip));
3468		return (MDI_FAILURE);
3469	}
3470
3471	/*
3472	 * If this path has not been initialized yet, Callback vHCI driver's
3473	 * pathinfo node initialize entry point
3474	 */
3475
3476	if (MDI_PI_IS_INITING(pip)) {
3477		MDI_PI_UNLOCK(pip);
3478		f = vh->vh_ops->vo_pi_init;
3479		if (f != NULL) {
3480			rv = (*f)(vh->vh_dip, pip, 0);
3481			if (rv != MDI_SUCCESS) {
3482				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3483				    "!vo_pi_init failed: vHCI %p, pip %s %p",
3484				    (void *)vh, mdi_pi_spathname(pip),
3485				    (void *)pip));
3486				return (MDI_FAILURE);
3487			}
3488		}
3489		MDI_PI_LOCK(pip);
3490		MDI_PI_CLEAR_TRANSIENT(pip);
3491	}
3492
3493	/*
3494	 * Do not allow state transition when pHCI is in offline/suspended
3495	 * states
3496	 */
3497	i_mdi_phci_lock(ph, pip);
3498	if (MDI_PHCI_IS_READY(ph) == 0) {
3499		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3500		    "!pHCI not ready, pHCI=%p", (void *)ph));
3501		MDI_PI_UNLOCK(pip);
3502		i_mdi_phci_unlock(ph);
3503		return (MDI_BUSY);
3504	}
3505	MDI_PHCI_UNSTABLE(ph);
3506	i_mdi_phci_unlock(ph);
3507
3508	/*
3509	 * Check if mdi_pathinfo state is in transient state.
3510	 * If yes, offlining is in progress and wait till transient state is
3511	 * cleared.
3512	 */
3513	if (MDI_PI_IS_TRANSIENT(pip)) {
3514		while (MDI_PI_IS_TRANSIENT(pip)) {
3515			cv_wait(&MDI_PI(pip)->pi_state_cv,
3516			    &MDI_PI(pip)->pi_mutex);
3517		}
3518	}
3519
3520	/*
3521	 * Grab the client lock in reverse order sequence and release the
3522	 * mdi_pathinfo mutex.
3523	 */
3524	i_mdi_client_lock(ct, pip);
3525	MDI_PI_UNLOCK(pip);
3526
3527	/*
3528	 * Wait till failover state is cleared
3529	 */
3530	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3531		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3532
3533	/*
3534	 * Mark the mdi_pathinfo node state as transient
3535	 */
3536	MDI_PI_LOCK(pip);
3537	switch (state) {
3538	case MDI_PATHINFO_STATE_ONLINE:
3539		MDI_PI_SET_ONLINING(pip);
3540		break;
3541
3542	case MDI_PATHINFO_STATE_STANDBY:
3543		MDI_PI_SET_STANDBYING(pip);
3544		break;
3545
3546	case MDI_PATHINFO_STATE_FAULT:
3547		/*
3548		 * Mark the pathinfo state as FAULTED
3549		 */
3550		MDI_PI_SET_FAULTING(pip);
3551		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
3552		break;
3553
3554	case MDI_PATHINFO_STATE_OFFLINE:
3555		/*
3556		 * ndi_devi_offline() cannot hold pip or ct locks.
3557		 */
3558		MDI_PI_UNLOCK(pip);
3559
3560		/*
3561		 * If this is a user initiated path online->offline operation
3562		 * who's success would transition a client from DEGRADED to
3563		 * FAILED then only proceed if we can offline the client first.
3564		 */
3565		cdip = ct->ct_dip;
3566		if ((flag & NDI_USER_REQ) &&
3567		    MDI_PI_IS_ONLINE(pip) &&
3568		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
3569			i_mdi_client_unlock(ct);
3570			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
3571			if (rv != NDI_SUCCESS) {
3572				/*
3573				 * Convert to MDI error code
3574				 */
3575				switch (rv) {
3576				case NDI_BUSY:
3577					rv = MDI_BUSY;
3578					break;
3579				default:
3580					rv = MDI_FAILURE;
3581					break;
3582				}
3583				goto state_change_exit;
3584			} else {
3585				i_mdi_client_lock(ct, NULL);
3586			}
3587		}
3588		/*
3589		 * Mark the mdi_pathinfo node state as transient
3590		 */
3591		MDI_PI_LOCK(pip);
3592		MDI_PI_SET_OFFLINING(pip);
3593		break;
3594	}
3595	MDI_PI_UNLOCK(pip);
3596	MDI_CLIENT_UNSTABLE(ct);
3597	i_mdi_client_unlock(ct);
3598
3599	f = vh->vh_ops->vo_pi_state_change;
3600	if (f != NULL)
3601		rv = (*f)(vh->vh_dip, pip, state, 0, flag);
3602
3603	MDI_CLIENT_LOCK(ct);
3604	MDI_PI_LOCK(pip);
3605	if (rv == MDI_NOT_SUPPORTED) {
3606		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
3607	}
3608	if (rv != MDI_SUCCESS) {
3609		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
3610		    "vo_pi_state_change failed: rv %x", rv));
3611	}
3612	if (MDI_PI_IS_TRANSIENT(pip)) {
3613		if (rv == MDI_SUCCESS) {
3614			MDI_PI_CLEAR_TRANSIENT(pip);
3615		} else {
3616			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
3617		}
3618	}
3619
3620	/*
3621	 * Wake anyone waiting for this mdi_pathinfo node
3622	 */
3623	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3624	MDI_PI_UNLOCK(pip);
3625
3626	/*
3627	 * Mark the client device as stable
3628	 */
3629	MDI_CLIENT_STABLE(ct);
3630	if (rv == MDI_SUCCESS) {
3631		if (ct->ct_unstable == 0) {
3632			cdip = ct->ct_dip;
3633
3634			/*
3635			 * Onlining the mdi_pathinfo node will impact the
3636			 * client state Update the client and dev_info node
3637			 * state accordingly
3638			 */
3639			rv = NDI_SUCCESS;
3640			i_mdi_client_update_state(ct);
3641			switch (MDI_CLIENT_STATE(ct)) {
3642			case MDI_CLIENT_STATE_OPTIMAL:
3643			case MDI_CLIENT_STATE_DEGRADED:
3644				if (cdip && !i_ddi_devi_attached(cdip) &&
3645				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
3646				    (state == MDI_PATHINFO_STATE_STANDBY))) {
3647
3648					/*
3649					 * Must do ndi_devi_online() through
3650					 * hotplug thread for deferred
3651					 * attach mechanism to work
3652					 */
3653					MDI_CLIENT_UNLOCK(ct);
3654					rv = ndi_devi_online(cdip, 0);
3655					MDI_CLIENT_LOCK(ct);
3656					if ((rv != NDI_SUCCESS) &&
3657					    (MDI_CLIENT_STATE(ct) ==
3658					    MDI_CLIENT_STATE_DEGRADED)) {
3659						/*
3660						 * ndi_devi_online failed.
3661						 * Reset client flags to
3662						 * offline.
3663						 */
3664						MDI_DEBUG(1, (MDI_WARN, cdip,
3665						    "!ndi_devi_online failed "
3666						    "error %x", rv));
3667						MDI_CLIENT_SET_OFFLINE(ct);
3668					}
3669					if (rv != NDI_SUCCESS) {
3670						/* Reset the path state */
3671						MDI_PI_LOCK(pip);
3672						MDI_PI(pip)->pi_state =
3673						    MDI_PI_OLD_STATE(pip);
3674						MDI_PI_UNLOCK(pip);
3675					}
3676				}
3677				break;
3678
3679			case MDI_CLIENT_STATE_FAILED:
3680				/*
3681				 * This is the last path case for
3682				 * non-user initiated events.
3683				 */
3684				if (((flag & NDI_USER_REQ) == 0) &&
3685				    cdip && (i_ddi_node_state(cdip) >=
3686				    DS_INITIALIZED)) {
3687					MDI_CLIENT_UNLOCK(ct);
3688					rv = ndi_devi_offline(cdip,
3689					    NDI_DEVFS_CLEAN);
3690					MDI_CLIENT_LOCK(ct);
3691
3692					if (rv != NDI_SUCCESS) {
3693						/*
3694						 * ndi_devi_offline failed.
3695						 * Reset client flags to
3696						 * online as the path could not
3697						 * be offlined.
3698						 */
3699						MDI_DEBUG(1, (MDI_WARN, cdip,
3700						    "!ndi_devi_offline failed: "
3701						    "error %x", rv));
3702						MDI_CLIENT_SET_ONLINE(ct);
3703					}
3704				}
3705				break;
3706			}
3707			/*
3708			 * Convert to MDI error code
3709			 */
3710			switch (rv) {
3711			case NDI_SUCCESS:
3712				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3713				i_mdi_report_path_state(ct, pip);
3714				rv = MDI_SUCCESS;
3715				break;
3716			case NDI_BUSY:
3717				rv = MDI_BUSY;
3718				break;
3719			default:
3720				rv = MDI_FAILURE;
3721				break;
3722			}
3723		}
3724	}
3725	MDI_CLIENT_UNLOCK(ct);
3726
3727state_change_exit:
3728	/*
3729	 * Mark the pHCI as stable again.
3730	 */
3731	MDI_PHCI_LOCK(ph);
3732	MDI_PHCI_STABLE(ph);
3733	MDI_PHCI_UNLOCK(ph);
3734	return (rv);
3735}
3736
3737/*
3738 * mdi_pi_online():
3739 *		Place the path_info node in the online state.  The path is
3740 *		now available to be selected by mdi_select_path() for
3741 *		transporting I/O requests to client devices.
3742 * Return Values:
3743 *		MDI_SUCCESS
3744 *		MDI_FAILURE
3745 */
3746int
3747mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3748{
3749	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3750	int		client_held = 0;
3751	int		rv;
3752
3753	ASSERT(ct != NULL);
3754	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3755	if (rv != MDI_SUCCESS)
3756		return (rv);
3757
3758	MDI_PI_LOCK(pip);
3759	if (MDI_PI(pip)->pi_pm_held == 0) {
3760		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3761		    "i_mdi_pm_hold_pip %p", (void *)pip));
3762		i_mdi_pm_hold_pip(pip);
3763		client_held = 1;
3764	}
3765	MDI_PI_UNLOCK(pip);
3766
3767	if (client_held) {
3768		MDI_CLIENT_LOCK(ct);
3769		if (ct->ct_power_cnt == 0) {
3770			rv = i_mdi_power_all_phci(ct);
3771		}
3772
3773		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3774		    "i_mdi_pm_hold_client %p", (void *)ct));
3775		i_mdi_pm_hold_client(ct, 1);
3776		MDI_CLIENT_UNLOCK(ct);
3777	}
3778
3779	return (rv);
3780}
3781
3782/*
3783 * mdi_pi_standby():
3784 *		Place the mdi_pathinfo node in standby state
3785 *
3786 * Return Values:
3787 *		MDI_SUCCESS
3788 *		MDI_FAILURE
3789 */
3790int
3791mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3792{
3793	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3794}
3795
3796/*
3797 * mdi_pi_fault():
3798 *		Place the mdi_pathinfo node in fault'ed state
3799 * Return Values:
3800 *		MDI_SUCCESS
3801 *		MDI_FAILURE
3802 */
3803int
3804mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3805{
3806	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3807}
3808
3809/*
3810 * mdi_pi_offline():
3811 *		Offline a mdi_pathinfo node.
3812 * Return Values:
3813 *		MDI_SUCCESS
3814 *		MDI_FAILURE
3815 */
3816int
3817mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3818{
3819	int	ret, client_held = 0;
3820	mdi_client_t	*ct;
3821
3822	/*
3823	 * Original code overloaded NDI_DEVI_REMOVE to this interface, and
3824	 * used it to mean "user initiated operation" (i.e. devctl). Callers
3825	 * should now just use NDI_USER_REQ.
3826	 */
3827	if (flags & NDI_DEVI_REMOVE) {
3828		flags &= ~NDI_DEVI_REMOVE;
3829		flags |= NDI_USER_REQ;
3830	}
3831
3832	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3833
3834	if (ret == MDI_SUCCESS) {
3835		MDI_PI_LOCK(pip);
3836		if (MDI_PI(pip)->pi_pm_held) {
3837			client_held = 1;
3838		}
3839		MDI_PI_UNLOCK(pip);
3840
3841		if (client_held) {
3842			ct = MDI_PI(pip)->pi_client;
3843			MDI_CLIENT_LOCK(ct);
3844			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3845			    "i_mdi_pm_rele_client\n"));
3846			i_mdi_pm_rele_client(ct, 1);
3847			MDI_CLIENT_UNLOCK(ct);
3848		}
3849	}
3850
3851	return (ret);
3852}
3853
3854/*
3855 * i_mdi_pi_offline():
3856 *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3857 */
3858static int
3859i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3860{
3861	dev_info_t	*vdip = NULL;
3862	mdi_vhci_t	*vh = NULL;
3863	mdi_client_t	*ct = NULL;
3864	int		(*f)();
3865	int		rv;
3866
3867	MDI_PI_LOCK(pip);
3868	ct = MDI_PI(pip)->pi_client;
3869	ASSERT(ct != NULL);
3870
3871	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3872		/*
3873		 * Give a chance for pending I/Os to complete.
3874		 */
3875		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3876		    "!%d cmds still pending on path %s %p",
3877		    MDI_PI(pip)->pi_ref_cnt, mdi_pi_spathname(pip),
3878		    (void *)pip));
3879		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
3880		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
3881		    TR_CLOCK_TICK) == -1) {
3882			/*
3883			 * The timeout time reached without ref_cnt being zero
3884			 * being signaled.
3885			 */
3886			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3887			    "!Timeout reached on path %s %p without the cond",
3888			    mdi_pi_spathname(pip), (void *)pip));
3889			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3890			    "!%d cmds still pending on path %s %p",
3891			    MDI_PI(pip)->pi_ref_cnt,
3892			    mdi_pi_spathname(pip), (void *)pip));
3893		}
3894	}
3895	vh = ct->ct_vhci;
3896	vdip = vh->vh_dip;
3897
3898	/*
3899	 * Notify vHCI that has registered this event
3900	 */
3901	ASSERT(vh->vh_ops);
3902	f = vh->vh_ops->vo_pi_state_change;
3903
3904	if (f != NULL) {
3905		MDI_PI_UNLOCK(pip);
3906		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3907		    flags)) != MDI_SUCCESS) {
3908			MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3909			    "!vo_path_offline failed: vdip %s%d %p: path %s %p",
3910			    ddi_driver_name(vdip), ddi_get_instance(vdip),
3911			    (void *)vdip, mdi_pi_spathname(pip), (void *)pip));
3912		}
3913		MDI_PI_LOCK(pip);
3914	}
3915
3916	/*
3917	 * Set the mdi_pathinfo node state and clear the transient condition
3918	 */
3919	MDI_PI_SET_OFFLINE(pip);
3920	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3921	MDI_PI_UNLOCK(pip);
3922
3923	MDI_CLIENT_LOCK(ct);
3924	if (rv == MDI_SUCCESS) {
3925		if (ct->ct_unstable == 0) {
3926			dev_info_t	*cdip = ct->ct_dip;
3927
3928			/*
3929			 * Onlining the mdi_pathinfo node will impact the
3930			 * client state Update the client and dev_info node
3931			 * state accordingly
3932			 */
3933			i_mdi_client_update_state(ct);
3934			rv = NDI_SUCCESS;
3935			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3936				if (cdip &&
3937				    (i_ddi_node_state(cdip) >=
3938				    DS_INITIALIZED)) {
3939					MDI_CLIENT_UNLOCK(ct);
3940					rv = ndi_devi_offline(cdip,
3941					    NDI_DEVFS_CLEAN);
3942					MDI_CLIENT_LOCK(ct);
3943					if (rv != NDI_SUCCESS) {
3944						/*
3945						 * ndi_devi_offline failed.
3946						 * Reset client flags to
3947						 * online.
3948						 */
3949						MDI_DEBUG(4, (MDI_WARN, cdip,
3950						    "ndi_devi_offline failed: "
3951						    "error %x", rv));
3952						MDI_CLIENT_SET_ONLINE(ct);
3953					}
3954				}
3955			}
3956			/*
3957			 * Convert to MDI error code
3958			 */
3959			switch (rv) {
3960			case NDI_SUCCESS:
3961				rv = MDI_SUCCESS;
3962				break;
3963			case NDI_BUSY:
3964				rv = MDI_BUSY;
3965				break;
3966			default:
3967				rv = MDI_FAILURE;
3968				break;
3969			}
3970		}
3971		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3972		i_mdi_report_path_state(ct, pip);
3973	}
3974
3975	MDI_CLIENT_UNLOCK(ct);
3976
3977	/*
3978	 * Change in the mdi_pathinfo node state will impact the client state
3979	 */
3980	MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
3981	    "ct = %p pip = %p", (void *)ct, (void *)pip));
3982	return (rv);
3983}
3984
3985/*
3986 * i_mdi_pi_online():
3987 *		Online a mdi_pathinfo node and call the vHCI driver's callback
3988 */
3989static int
3990i_mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3991{
3992	mdi_vhci_t	*vh = NULL;
3993	mdi_client_t	*ct = NULL;
3994	mdi_phci_t	*ph;
3995	int		(*f)();
3996	int		rv;
3997
3998	MDI_PI_LOCK(pip);
3999	ph = MDI_PI(pip)->pi_phci;
4000	vh = ph->ph_vhci;
4001	ct = MDI_PI(pip)->pi_client;
4002	MDI_PI_SET_ONLINING(pip)
4003	MDI_PI_UNLOCK(pip);
4004	f = vh->vh_ops->vo_pi_state_change;
4005	if (f != NULL)
4006		rv = (*f)(vh->vh_dip, pip, MDI_PATHINFO_STATE_ONLINE, 0,
4007		    flags);
4008	MDI_CLIENT_LOCK(ct);
4009	MDI_PI_LOCK(pip);
4010	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
4011	MDI_PI_UNLOCK(pip);
4012	if (rv == MDI_SUCCESS) {
4013		dev_info_t	*cdip = ct->ct_dip;
4014
4015		rv = MDI_SUCCESS;
4016		i_mdi_client_update_state(ct);
4017		if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL ||
4018		    MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4019			if (cdip && !i_ddi_devi_attached(cdip)) {
4020				MDI_CLIENT_UNLOCK(ct);
4021				rv = ndi_devi_online(cdip, 0);
4022				MDI_CLIENT_LOCK(ct);
4023				if ((rv != NDI_SUCCESS) &&
4024				    (MDI_CLIENT_STATE(ct) ==
4025				    MDI_CLIENT_STATE_DEGRADED)) {
4026					MDI_CLIENT_SET_OFFLINE(ct);
4027				}
4028				if (rv != NDI_SUCCESS) {
4029					/* Reset the path state */
4030					MDI_PI_LOCK(pip);
4031					MDI_PI(pip)->pi_state =
4032					    MDI_PI_OLD_STATE(pip);
4033					MDI_PI_UNLOCK(pip);
4034				}
4035			}
4036		}
4037		switch (rv) {
4038		case NDI_SUCCESS:
4039			MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
4040			i_mdi_report_path_state(ct, pip);
4041			rv = MDI_SUCCESS;
4042			break;
4043		case NDI_BUSY:
4044			rv = MDI_BUSY;
4045			break;
4046		default:
4047			rv = MDI_FAILURE;
4048			break;
4049		}
4050	} else {
4051		/* Reset the path state */
4052		MDI_PI_LOCK(pip);
4053		MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
4054		MDI_PI_UNLOCK(pip);
4055	}
4056	MDI_CLIENT_UNLOCK(ct);
4057	return (rv);
4058}
4059
4060/*
4061 * mdi_pi_get_node_name():
4062 *              Get the name associated with a mdi_pathinfo node.
4063 *              Since pathinfo nodes are not directly named, we
4064 *              return the node_name of the client.
4065 *
4066 * Return Values:
4067 *              char *
4068 */
4069char *
4070mdi_pi_get_node_name(mdi_pathinfo_t *pip)
4071{
4072	mdi_client_t    *ct;
4073
4074	if (pip == NULL)
4075		return (NULL);
4076	ct = MDI_PI(pip)->pi_client;
4077	if ((ct == NULL) || (ct->ct_dip == NULL))
4078		return (NULL);
4079	return (ddi_node_name(ct->ct_dip));
4080}
4081
4082/*
4083 * mdi_pi_get_addr():
4084 *		Get the unit address associated with a mdi_pathinfo node
4085 *
4086 * Return Values:
4087 *		char *
4088 */
4089char *
4090mdi_pi_get_addr(mdi_pathinfo_t *pip)
4091{
4092	if (pip == NULL)
4093		return (NULL);
4094
4095	return (MDI_PI(pip)->pi_addr);
4096}
4097
4098/*
4099 * mdi_pi_get_path_instance():
4100 *		Get the 'path_instance' of a mdi_pathinfo node
4101 *
4102 * Return Values:
4103 *		path_instance
4104 */
4105int
4106mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
4107{
4108	if (pip == NULL)
4109		return (0);
4110
4111	return (MDI_PI(pip)->pi_path_instance);
4112}
4113
4114/*
4115 * mdi_pi_pathname():
4116 *		Return pointer to path to pathinfo node.
4117 */
4118char *
4119mdi_pi_pathname(mdi_pathinfo_t *pip)
4120{
4121	if (pip == NULL)
4122		return (NULL);
4123	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
4124}
4125
4126/*
4127 * mdi_pi_spathname():
4128 *		Return pointer to shortpath to pathinfo node. Used for debug
4129 *		messages, so return "" instead of NULL when unknown.
4130 */
4131char *
4132mdi_pi_spathname(mdi_pathinfo_t *pip)
4133{
4134	char	*spath = "";
4135
4136	if (pip) {
4137		spath = mdi_pi_spathname_by_instance(
4138		    mdi_pi_get_path_instance(pip));
4139		if (spath == NULL)
4140			spath = "";
4141	}
4142	return (spath);
4143}
4144
4145char *
4146mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
4147{
4148	char *obp_path = NULL;
4149	if ((pip == NULL) || (path == NULL))
4150		return (NULL);
4151
4152	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
4153		(void) strcpy(path, obp_path);
4154		(void) mdi_prop_free(obp_path);
4155	} else {
4156		path = NULL;
4157	}
4158	return (path);
4159}
4160
4161int
4162mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
4163{
4164	dev_info_t *pdip;
4165	char *obp_path = NULL;
4166	int rc = MDI_FAILURE;
4167
4168	if (pip == NULL)
4169		return (MDI_FAILURE);
4170
4171	pdip = mdi_pi_get_phci(pip);
4172	if (pdip == NULL)
4173		return (MDI_FAILURE);
4174
4175	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4176
4177	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
4178		(void) ddi_pathname(pdip, obp_path);
4179	}
4180
4181	if (component) {
4182		(void) strncat(obp_path, "/", MAXPATHLEN);
4183		(void) strncat(obp_path, component, MAXPATHLEN);
4184	}
4185	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
4186
4187	if (obp_path)
4188		kmem_free(obp_path, MAXPATHLEN);
4189	return (rc);
4190}
4191
4192/*
4193 * mdi_pi_get_client():
4194 *		Get the client devinfo associated with a mdi_pathinfo node
4195 *
4196 * Return Values:
4197 *		Handle to client device dev_info node
4198 */
4199dev_info_t *
4200mdi_pi_get_client(mdi_pathinfo_t *pip)
4201{
4202	dev_info_t	*dip = NULL;
4203	if (pip) {
4204		dip = MDI_PI(pip)->pi_client->ct_dip;
4205	}
4206	return (dip);
4207}
4208
4209/*
4210 * mdi_pi_get_phci():
4211 *		Get the pHCI devinfo associated with the mdi_pathinfo node
4212 * Return Values:
4213 *		Handle to dev_info node
4214 */
4215dev_info_t *
4216mdi_pi_get_phci(mdi_pathinfo_t *pip)
4217{
4218	dev_info_t	*dip = NULL;
4219	mdi_phci_t	*ph;
4220
4221	if (pip) {
4222		ph = MDI_PI(pip)->pi_phci;
4223		if (ph)
4224			dip = ph->ph_dip;
4225	}
4226	return (dip);
4227}
4228
4229/*
4230 * mdi_pi_get_client_private():
4231 *		Get the client private information associated with the
4232 *		mdi_pathinfo node
4233 */
4234void *
4235mdi_pi_get_client_private(mdi_pathinfo_t *pip)
4236{
4237	void *cprivate = NULL;
4238	if (pip) {
4239		cprivate = MDI_PI(pip)->pi_cprivate;
4240	}
4241	return (cprivate);
4242}
4243
4244/*
4245 * mdi_pi_set_client_private():
4246 *		Set the client private information in the mdi_pathinfo node
4247 */
4248void
4249mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
4250{
4251	if (pip) {
4252		MDI_PI(pip)->pi_cprivate = priv;
4253	}
4254}
4255
4256/*
4257 * mdi_pi_get_phci_private():
4258 *		Get the pHCI private information associated with the
4259 *		mdi_pathinfo node
4260 */
4261caddr_t
4262mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
4263{
4264	caddr_t	pprivate = NULL;
4265
4266	if (pip) {
4267		pprivate = MDI_PI(pip)->pi_pprivate;
4268	}
4269	return (pprivate);
4270}
4271
4272/*
4273 * mdi_pi_set_phci_private():
4274 *		Set the pHCI private information in the mdi_pathinfo node
4275 */
4276void
4277mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4278{
4279	if (pip) {
4280		MDI_PI(pip)->pi_pprivate = priv;
4281	}
4282}
4283
4284/*
4285 * mdi_pi_get_state():
4286 *		Get the mdi_pathinfo node state. Transient states are internal
4287 *		and not provided to the users
4288 */
4289mdi_pathinfo_state_t
4290mdi_pi_get_state(mdi_pathinfo_t *pip)
4291{
4292	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4293
4294	if (pip) {
4295		if (MDI_PI_IS_TRANSIENT(pip)) {
4296			/*
4297			 * mdi_pathinfo is in state transition.  Return the
4298			 * last good state.
4299			 */
4300			state = MDI_PI_OLD_STATE(pip);
4301		} else {
4302			state = MDI_PI_STATE(pip);
4303		}
4304	}
4305	return (state);
4306}
4307
4308/*
4309 * mdi_pi_get_flags():
4310 *		Get the mdi_pathinfo node flags.
4311 */
4312uint_t
4313mdi_pi_get_flags(mdi_pathinfo_t *pip)
4314{
4315	return (pip ? MDI_PI(pip)->pi_flags : 0);
4316}
4317
4318/*
4319 * Note that the following function needs to be the new interface for
4320 * mdi_pi_get_state when mpxio gets integrated to ON.
4321 */
4322int
4323mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4324		uint32_t *ext_state)
4325{
4326	*state = MDI_PATHINFO_STATE_INIT;
4327
4328	if (pip) {
4329		if (MDI_PI_IS_TRANSIENT(pip)) {
4330			/*
4331			 * mdi_pathinfo is in state transition.  Return the
4332			 * last good state.
4333			 */
4334			*state = MDI_PI_OLD_STATE(pip);
4335			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4336		} else {
4337			*state = MDI_PI_STATE(pip);
4338			*ext_state = MDI_PI_EXT_STATE(pip);
4339		}
4340	}
4341	return (MDI_SUCCESS);
4342}
4343
4344/*
4345 * mdi_pi_get_preferred:
4346 *	Get the preferred path flag
4347 */
4348int
4349mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4350{
4351	if (pip) {
4352		return (MDI_PI(pip)->pi_preferred);
4353	}
4354	return (0);
4355}
4356
4357/*
4358 * mdi_pi_set_preferred:
4359 *	Set the preferred path flag
4360 */
4361void
4362mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4363{
4364	if (pip) {
4365		MDI_PI(pip)->pi_preferred = preferred;
4366	}
4367}
4368
4369/*
4370 * mdi_pi_set_state():
4371 *		Set the mdi_pathinfo node state
4372 */
4373void
4374mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4375{
4376	uint32_t	ext_state;
4377
4378	if (pip) {
4379		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4380		MDI_PI(pip)->pi_state = state;
4381		MDI_PI(pip)->pi_state |= ext_state;
4382
4383		/* Path has changed state, invalidate DINFOCACHE snap shot. */
4384		i_ddi_di_cache_invalidate();
4385	}
4386}
4387
4388/*
4389 * Property functions:
4390 */
4391int
4392i_map_nvlist_error_to_mdi(int val)
4393{
4394	int rv;
4395
4396	switch (val) {
4397	case 0:
4398		rv = DDI_PROP_SUCCESS;
4399		break;
4400	case EINVAL:
4401	case ENOTSUP:
4402		rv = DDI_PROP_INVAL_ARG;
4403		break;
4404	case ENOMEM:
4405		rv = DDI_PROP_NO_MEMORY;
4406		break;
4407	default:
4408		rv = DDI_PROP_NOT_FOUND;
4409		break;
4410	}
4411	return (rv);
4412}
4413
4414/*
4415 * mdi_pi_get_next_prop():
4416 * 		Property walk function.  The caller should hold mdi_pi_lock()
4417 *		and release by calling mdi_pi_unlock() at the end of walk to
4418 *		get a consistent value.
4419 */
4420nvpair_t *
4421mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4422{
4423	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4424		return (NULL);
4425	}
4426	ASSERT(MDI_PI_LOCKED(pip));
4427	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4428}
4429
4430/*
4431 * mdi_prop_remove():
4432 * 		Remove the named property from the named list.
4433 */
4434int
4435mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4436{
4437	if (pip == NULL) {
4438		return (DDI_PROP_NOT_FOUND);
4439	}
4440	ASSERT(!MDI_PI_LOCKED(pip));
4441	MDI_PI_LOCK(pip);
4442	if (MDI_PI(pip)->pi_prop == NULL) {
4443		MDI_PI_UNLOCK(pip);
4444		return (DDI_PROP_NOT_FOUND);
4445	}
4446	if (name) {
4447		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4448	} else {
4449		char		nvp_name[MAXNAMELEN];
4450		nvpair_t	*nvp;
4451		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4452		while (nvp) {
4453			nvpair_t	*next;
4454			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4455			(void) snprintf(nvp_name, sizeof(nvp_name), "%s",
4456			    nvpair_name(nvp));
4457			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4458			    nvp_name);
4459			nvp = next;
4460		}
4461	}
4462	MDI_PI_UNLOCK(pip);
4463	return (DDI_PROP_SUCCESS);
4464}
4465
4466/*
4467 * mdi_prop_size():
4468 * 		Get buffer size needed to pack the property data.
4469 * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4470 *		buffer size.
4471 */
4472int
4473mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4474{
4475	int	rv;
4476	size_t	bufsize;
4477
4478	*buflenp = 0;
4479	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4480		return (DDI_PROP_NOT_FOUND);
4481	}
4482	ASSERT(MDI_PI_LOCKED(pip));
4483	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4484	    &bufsize, NV_ENCODE_NATIVE);
4485	*buflenp = bufsize;
4486	return (i_map_nvlist_error_to_mdi(rv));
4487}
4488
4489/*
4490 * mdi_prop_pack():
4491 * 		pack the property list.  The caller should hold the
4492 *		mdi_pathinfo_t node to get a consistent data
4493 */
4494int
4495mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4496{
4497	int	rv;
4498	size_t	bufsize;
4499
4500	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4501		return (DDI_PROP_NOT_FOUND);
4502	}
4503
4504	ASSERT(MDI_PI_LOCKED(pip));
4505
4506	bufsize = buflen;
4507	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4508	    NV_ENCODE_NATIVE, KM_SLEEP);
4509
4510	return (i_map_nvlist_error_to_mdi(rv));
4511}
4512
4513/*
4514 * mdi_prop_update_byte():
4515 *		Create/Update a byte property
4516 */
4517int
4518mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4519{
4520	int rv;
4521
4522	if (pip == NULL) {
4523		return (DDI_PROP_INVAL_ARG);
4524	}
4525	ASSERT(!MDI_PI_LOCKED(pip));
4526	MDI_PI_LOCK(pip);
4527	if (MDI_PI(pip)->pi_prop == NULL) {
4528		MDI_PI_UNLOCK(pip);
4529		return (DDI_PROP_NOT_FOUND);
4530	}
4531	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4532	MDI_PI_UNLOCK(pip);
4533	return (i_map_nvlist_error_to_mdi(rv));
4534}
4535
4536/*
4537 * mdi_prop_update_byte_array():
4538 *		Create/Update a byte array property
4539 */
4540int
4541mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4542    uint_t nelements)
4543{
4544	int rv;
4545
4546	if (pip == NULL) {
4547		return (DDI_PROP_INVAL_ARG);
4548	}
4549	ASSERT(!MDI_PI_LOCKED(pip));
4550	MDI_PI_LOCK(pip);
4551	if (MDI_PI(pip)->pi_prop == NULL) {
4552		MDI_PI_UNLOCK(pip);
4553		return (DDI_PROP_NOT_FOUND);
4554	}
4555	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4556	MDI_PI_UNLOCK(pip);
4557	return (i_map_nvlist_error_to_mdi(rv));
4558}
4559
4560/*
4561 * mdi_prop_update_int():
4562 *		Create/Update a 32 bit integer property
4563 */
4564int
4565mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4566{
4567	int rv;
4568
4569	if (pip == NULL) {
4570		return (DDI_PROP_INVAL_ARG);
4571	}
4572	ASSERT(!MDI_PI_LOCKED(pip));
4573	MDI_PI_LOCK(pip);
4574	if (MDI_PI(pip)->pi_prop == NULL) {
4575		MDI_PI_UNLOCK(pip);
4576		return (DDI_PROP_NOT_FOUND);
4577	}
4578	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4579	MDI_PI_UNLOCK(pip);
4580	return (i_map_nvlist_error_to_mdi(rv));
4581}
4582
4583/*
4584 * mdi_prop_update_int64():
4585 *		Create/Update a 64 bit integer property
4586 */
4587int
4588mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4589{
4590	int rv;
4591
4592	if (pip == NULL) {
4593		return (DDI_PROP_INVAL_ARG);
4594	}
4595	ASSERT(!MDI_PI_LOCKED(pip));
4596	MDI_PI_LOCK(pip);
4597	if (MDI_PI(pip)->pi_prop == NULL) {
4598		MDI_PI_UNLOCK(pip);
4599		return (DDI_PROP_NOT_FOUND);
4600	}
4601	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4602	MDI_PI_UNLOCK(pip);
4603	return (i_map_nvlist_error_to_mdi(rv));
4604}
4605
4606/*
4607 * mdi_prop_update_int_array():
4608 *		Create/Update a int array property
4609 */
4610int
4611mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4612	    uint_t nelements)
4613{
4614	int rv;
4615
4616	if (pip == NULL) {
4617		return (DDI_PROP_INVAL_ARG);
4618	}
4619	ASSERT(!MDI_PI_LOCKED(pip));
4620	MDI_PI_LOCK(pip);
4621	if (MDI_PI(pip)->pi_prop == NULL) {
4622		MDI_PI_UNLOCK(pip);
4623		return (DDI_PROP_NOT_FOUND);
4624	}
4625	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4626	    nelements);
4627	MDI_PI_UNLOCK(pip);
4628	return (i_map_nvlist_error_to_mdi(rv));
4629}
4630
4631/*
4632 * mdi_prop_update_string():
4633 *		Create/Update a string property
4634 */
4635int
4636mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4637{
4638	int rv;
4639
4640	if (pip == NULL) {
4641		return (DDI_PROP_INVAL_ARG);
4642	}
4643	ASSERT(!MDI_PI_LOCKED(pip));
4644	MDI_PI_LOCK(pip);
4645	if (MDI_PI(pip)->pi_prop == NULL) {
4646		MDI_PI_UNLOCK(pip);
4647		return (DDI_PROP_NOT_FOUND);
4648	}
4649	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4650	MDI_PI_UNLOCK(pip);
4651	return (i_map_nvlist_error_to_mdi(rv));
4652}
4653
4654/*
4655 * mdi_prop_update_string_array():
4656 *		Create/Update a string array property
4657 */
4658int
4659mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4660    uint_t nelements)
4661{
4662	int rv;
4663
4664	if (pip == NULL) {
4665		return (DDI_PROP_INVAL_ARG);
4666	}
4667	ASSERT(!MDI_PI_LOCKED(pip));
4668	MDI_PI_LOCK(pip);
4669	if (MDI_PI(pip)->pi_prop == NULL) {
4670		MDI_PI_UNLOCK(pip);
4671		return (DDI_PROP_NOT_FOUND);
4672	}
4673	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4674	    nelements);
4675	MDI_PI_UNLOCK(pip);
4676	return (i_map_nvlist_error_to_mdi(rv));
4677}
4678
4679/*
4680 * mdi_prop_lookup_byte():
4681 * 		Look for byte property identified by name.  The data returned
4682 *		is the actual property and valid as long as mdi_pathinfo_t node
4683 *		is alive.
4684 */
4685int
4686mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4687{
4688	int rv;
4689
4690	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4691		return (DDI_PROP_NOT_FOUND);
4692	}
4693	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4694	return (i_map_nvlist_error_to_mdi(rv));
4695}
4696
4697
4698/*
4699 * mdi_prop_lookup_byte_array():
4700 * 		Look for byte array property identified by name.  The data
4701 *		returned is the actual property and valid as long as
4702 *		mdi_pathinfo_t node is alive.
4703 */
4704int
4705mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4706    uint_t *nelements)
4707{
4708	int rv;
4709
4710	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4711		return (DDI_PROP_NOT_FOUND);
4712	}
4713	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4714	    nelements);
4715	return (i_map_nvlist_error_to_mdi(rv));
4716}
4717
4718/*
4719 * mdi_prop_lookup_int():
4720 * 		Look for int property identified by name.  The data returned
4721 *		is the actual property and valid as long as mdi_pathinfo_t
4722 *		node is alive.
4723 */
4724int
4725mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4726{
4727	int rv;
4728
4729	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4730		return (DDI_PROP_NOT_FOUND);
4731	}
4732	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4733	return (i_map_nvlist_error_to_mdi(rv));
4734}
4735
4736/*
4737 * mdi_prop_lookup_int64():
4738 * 		Look for int64 property identified by name.  The data returned
4739 *		is the actual property and valid as long as mdi_pathinfo_t node
4740 *		is alive.
4741 */
4742int
4743mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4744{
4745	int rv;
4746	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4747		return (DDI_PROP_NOT_FOUND);
4748	}
4749	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4750	return (i_map_nvlist_error_to_mdi(rv));
4751}
4752
4753/*
4754 * mdi_prop_lookup_int_array():
4755 * 		Look for int array property identified by name.  The data
4756 *		returned is the actual property and valid as long as
4757 *		mdi_pathinfo_t node is alive.
4758 */
4759int
4760mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4761    uint_t *nelements)
4762{
4763	int rv;
4764
4765	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4766		return (DDI_PROP_NOT_FOUND);
4767	}
4768	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4769	    (int32_t **)data, nelements);
4770	return (i_map_nvlist_error_to_mdi(rv));
4771}
4772
4773/*
4774 * mdi_prop_lookup_string():
4775 * 		Look for string property identified by name.  The data
4776 *		returned is the actual property and valid as long as
4777 *		mdi_pathinfo_t node is alive.
4778 */
4779int
4780mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4781{
4782	int rv;
4783
4784	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4785		return (DDI_PROP_NOT_FOUND);
4786	}
4787	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4788	return (i_map_nvlist_error_to_mdi(rv));
4789}
4790
4791/*
4792 * mdi_prop_lookup_string_array():
4793 * 		Look for string array property identified by name.  The data
4794 *		returned is the actual property and valid as long as
4795 *		mdi_pathinfo_t node is alive.
4796 */
4797int
4798mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4799    uint_t *nelements)
4800{
4801	int rv;
4802
4803	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4804		return (DDI_PROP_NOT_FOUND);
4805	}
4806	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4807	    nelements);
4808	return (i_map_nvlist_error_to_mdi(rv));
4809}
4810
4811/*
4812 * mdi_prop_free():
4813 * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4814 *		functions return the pointer to actual property data and not a
4815 *		copy of it.  So the data returned is valid as long as
4816 *		mdi_pathinfo_t node is valid.
4817 */
4818/*ARGSUSED*/
4819int
4820mdi_prop_free(void *data)
4821{
4822	return (DDI_PROP_SUCCESS);
4823}
4824
/*
 * i_mdi_report_path_state():
 *		Log a console/syslog message describing the multipath state
 *		of a client (optimal/degraded/failed) and of one of its paths
 *		(removed/offline/online/standby/faulted).  When both the
 *		client and the path are healthy, the load-balancing
 *		configuration is appended.  Caller must hold the client lock;
 *		reporting only happens if the client dip is attached and a
 *		report has been flagged as needed.
 */
/*ARGSUSED*/
static void
i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
{
	char		*ct_path;
	char		*ct_status;
	char		*status;
	dev_info_t	*cdip = ct->ct_dip;
	char		lb_buf[64];
	int		report_lb_c = 0, report_lb_p = 0;

	ASSERT(MDI_CLIENT_LOCKED(ct));
	/* Nothing to report without an instantiated dip or a pending flag. */
	if ((cdip == NULL) || (ddi_get_instance(cdip) == -1) ||
	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
		return;
	}
	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
		ct_status = "optimal";
		report_lb_c = 1;
	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
		ct_status = "degraded";
	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
		ct_status = "failed";
	} else {
		ct_status = "unknown";
	}

	lb_buf[0] = 0;		/* not interested in load balancing config */

	if (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip)) {
		status = "removed";
	} else if (MDI_PI_IS_OFFLINE(pip)) {
		status = "offline";
	} else if (MDI_PI_IS_ONLINE(pip)) {
		status = "online";
		report_lb_p = 1;
	} else if (MDI_PI_IS_STANDBY(pip)) {
		status = "standby";
	} else if (MDI_PI_IS_FAULT(pip)) {
		status = "faulted";
	} else {
		status = "unknown";
	}

	if (cdip) {
		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		/*
		 * NOTE: Keeping "multipath status: %s" and
		 * "Load balancing: %s" format unchanged in case someone
		 * scrubs /var/adm/messages looking for these messages.
		 */
		if (report_lb_c && report_lb_p) {
			/* Both sides healthy: include the LB configuration. */
			if (ct->ct_lb == LOAD_BALANCE_LBA) {
				(void) snprintf(lb_buf, sizeof (lb_buf),
				    "%s, region-size: %d", mdi_load_balance_lba,
				    ct->ct_lb_args->region_size);
			} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
				(void) snprintf(lb_buf, sizeof (lb_buf),
				    "%s", mdi_load_balance_none);
			} else {
				(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
				    mdi_load_balance_rr);
			}

			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
			    "?%s (%s%d) multipath status: %s: "
			    "path %d %s is %s: Load balancing: %s\n",
			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
			    ddi_get_instance(cdip), ct_status,
			    mdi_pi_get_path_instance(pip),
			    mdi_pi_spathname(pip), status, lb_buf);
		} else {
			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
			    "?%s (%s%d) multipath status: %s: "
			    "path %d %s is %s\n",
			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
			    ddi_get_instance(cdip), ct_status,
			    mdi_pi_get_path_instance(pip),
			    mdi_pi_spathname(pip), status);
		}

		kmem_free(ct_path, MAXPATHLEN);
		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
	}
}
4911
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Utility function for error message management
 *
 *		NOTE: Implementation takes care of trailing \n for cmn_err,
 *		MDI_DEBUG should not terminate fmt strings with \n.
 *
 *		NOTE: If the level is >= 2, and there is no leading !?^
 *		then a leading ! is implied (but can be overriden via
 *		mdi_debug_consoleonly). If you are using kmdb on the console,
 *		consider setting mdi_debug_consoleonly to 1 as an aid.
 */
/*PRINTFLIKE4*/
static void
i_mdi_log(int level, const char *func, dev_info_t *dip, const char *fmt, ...)
{
	char		name[MAXNAMELEN];
	char		buf[512];
	char		*bp;
	va_list		ap;
	int		log_only = 0;
	int		boot_only = 0;
	int		console_only = 0;

	/* Prefix messages with "driver instance: " when a dip is supplied. */
	if (dip) {
		(void) snprintf(name, sizeof(name), "%s%d: ",
		    ddi_driver_name(dip), ddi_get_instance(dip));
	} else {
		name[0] = 0;
	}

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	/*
	 * A leading !, ? or ^ in the formatted message selects the
	 * destination, mirroring cmn_err(9F) conventions; strip it here
	 * and re-apply it below once the final destination is known.
	 */
	switch (buf[0]) {
	case '!':
		bp = &buf[1];
		log_only = 1;
		break;
	case '?':
		bp = &buf[1];
		boot_only = 1;
		break;
	case '^':
		bp = &buf[1];
		console_only = 1;
		break;
	default:
		if (level >= 2)
			log_only = 1;		/* ! implied */
		bp = buf;
		break;
	}
	/* Global overrides: force everything to the log or to the console. */
	if (mdi_debug_logonly) {
		log_only = 1;
		boot_only = 0;
		console_only = 0;
	}
	if (mdi_debug_consoleonly) {
		log_only = 0;
		boot_only = 0;
		console_only = 1;
		level = CE_NOTE;
		goto console;
	}

	switch (level) {
	case CE_NOTE:
		level = CE_CONT;
		/* FALLTHROUGH */
	case CE_CONT:
		/* CE_CONT variants need an explicit trailing newline. */
		if (boot_only) {
			cmn_err(level, "?mdi: %s%s: %s\n", name, func, bp);
		} else if (console_only) {
			cmn_err(level, "^mdi: %s%s: %s\n", name, func, bp);
		} else if (log_only) {
			cmn_err(level, "!mdi: %s%s: %s\n", name, func, bp);
		} else {
			cmn_err(level, "mdi: %s%s: %s\n", name, func, bp);
		}
		break;

	case CE_WARN:
	case CE_PANIC:
	console:
		/* cmn_err supplies the newline for these levels. */
		if (boot_only) {
			cmn_err(level, "?mdi: %s%s: %s", name, func, bp);
		} else if (console_only) {
			cmn_err(level, "^mdi: %s%s: %s", name, func, bp);
		} else if (log_only) {
			cmn_err(level, "!mdi: %s%s: %s", name, func, bp);
		} else {
			cmn_err(level, "mdi: %s%s: %s", name, func, bp);
		}
		break;
	default:
		/* NOTE(review): func is intentionally(?) omitted here. */
		cmn_err(level, "mdi: %s%s", name, bp);
		break;
	}
}
#endif	/* DEBUG */
5015
/*
 * i_mdi_client_online():
 *		Client device online notification.  Marks the client state as
 *		online, restores the binding between the mdi_client_t and its
 *		dev_info node, powers up all pHCIs if no power count is held,
 *		and takes a PM hold on the client.
 */
void
i_mdi_client_online(dev_info_t *ct_dip)
{
	mdi_client_t	*ct;

	/*
	 * Client online notification. Mark client state as online
	 * restore our binding with dev_info node
	 */
	ct = i_devi_get_client(ct_dip);
	ASSERT(ct != NULL);
	MDI_CLIENT_LOCK(ct);
	MDI_CLIENT_SET_ONLINE(ct);
	/* catch for any memory leaks */
	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
	ct->ct_dip = ct_dip;

	/* No power count held yet: power up all pHCIs serving this client. */
	if (ct->ct_power_cnt == 0)
		(void) i_mdi_power_all_phci(ct);

	MDI_DEBUG(4, (MDI_NOTE, ct_dip,
	    "i_mdi_pm_hold_client %p", (void *)ct));
	i_mdi_pm_hold_client(ct, 1);

	MDI_CLIENT_UNLOCK(ct);
}
5042
/*
 * i_mdi_phci_online():
 *		pHCI device online notification.  Simply marks the pHCI state
 *		as online under the pHCI lock.
 */
void
i_mdi_phci_online(dev_info_t *ph_dip)
{
	mdi_phci_t	*ph;

	/* pHCI online notification. Mark state accordingly */
	ph = i_devi_get_phci(ph_dip);
	ASSERT(ph != NULL);
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_SET_ONLINE(ph);
	MDI_PHCI_UNLOCK(ph);
}
5055
5056/*
5057 * mdi_devi_online():
5058 * 		Online notification from NDI framework on pHCI/client
5059 *		device online.
5060 * Return Values:
5061 *		NDI_SUCCESS
5062 *		MDI_FAILURE
5063 */
5064/*ARGSUSED*/
5065int
5066mdi_devi_online(dev_info_t *dip, uint_t flags)
5067{
5068	if (MDI_PHCI(dip)) {
5069		i_mdi_phci_online(dip);
5070	}
5071
5072	if (MDI_CLIENT(dip)) {
5073		i_mdi_client_online(dip);
5074	}
5075	return (NDI_SUCCESS);
5076}
5077
5078/*
5079 * mdi_devi_offline():
5080 * 		Offline notification from NDI framework on pHCI/Client device
5081 *		offline.
5082 *
5083 * Return Values:
5084 *		NDI_SUCCESS
5085 *		NDI_FAILURE
5086 */
5087/*ARGSUSED*/
5088int
5089mdi_devi_offline(dev_info_t *dip, uint_t flags)
5090{
5091	int		rv = NDI_SUCCESS;
5092
5093	if (MDI_CLIENT(dip)) {
5094		rv = i_mdi_client_offline(dip, flags);
5095		if (rv != NDI_SUCCESS)
5096			return (rv);
5097	}
5098
5099	if (MDI_PHCI(dip)) {
5100		rv = i_mdi_phci_offline(dip, flags);
5101
5102		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
5103			/* set client back online */
5104			i_mdi_client_online(dip);
5105		}
5106	}
5107
5108	return (rv);
5109}
5110
/*
 * i_mdi_phci_offline():
 *		Offline a pHCI.  Phase 1 walks all paths checking that no
 *		client is failing over/unstable and offlining any client for
 *		which this pHCI holds the last usable path; if one of those
 *		client offlines fails, previously-offlined clients are rolled
 *		back and NDI_BUSY is returned.  Phase 2 marks the pHCI and
 *		all of its paths as offlining, waits briefly for in-flight
 *		commands, then offlines each path.  The pHCI lock is dropped
 *		and reacquired around calls into the NDI framework.
 */
/*ARGSUSED*/
static int
i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
{
	int		rv = NDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*failed_pip = NULL;
	dev_info_t	*cdip;

	/*
	 * pHCI component offline notification
	 * Make sure that this pHCI instance is free to be offlined.
	 * If it is OK to proceed, Offline and remove all the child
	 * mdi_pathinfo nodes.  This process automatically offlines
	 * corresponding client devices, for which this pHCI provides
	 * critical services.
	 */
	ph = i_devi_get_phci(dip);
	MDI_DEBUG(2, (MDI_NOTE, dip,
	    "called %p %p", (void *)dip, (void *)ph));
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!pHCI already offlined: %p", (void *)dip));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_SUCCESS);
	}

	/*
	 * Check to see if the pHCI can be offlined
	 */
	if (ph->ph_unstable) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!One or more target devices are in transient state. "
		    "This device can not be removed at this moment. "
		    "Please try again later."));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/* Phase 1: validate clients; offline any last-path clients. */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		/*
		 * The mdi_pathinfo state is OK. Check the client state.
		 * If failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			/*
			 * Failover is in progress, Fail the DR
			 */
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!pHCI device is busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are removing the last path of this
		 * client device...
		 */
		cdip = ct->ct_dip;
		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			if (ndi_devi_offline(cdip,
			    NDI_DEVFS_CLEAN) != NDI_SUCCESS) {
				/*
				 * ndi_devi_offline() failed.
				 * This pHCI provides the critical path
				 * to one or more client devices.
				 * Return busy.
				 */
				MDI_PHCI_LOCK(ph);
				MDI_DEBUG(1, (MDI_WARN, dip,
				    "!pHCI device is busy. "
				    "This device can not be removed at this "
				    "moment. Please try again later."));
				failed_pip = pip;
				break;
			} else {
				MDI_PHCI_LOCK(ph);
				pip = next;
			}
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	if (failed_pip) {
		/*
		 * Rollback: re-walk the paths processed before the failure
		 * and restore their clients to the state matching the
		 * current client state (online for optimal/degraded,
		 * offline for failed).
		 */
		pip = ph->ph_path_head;
		while (pip != failed_pip) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_online(cdip, 0);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_offline(cdip,
						NDI_DEVFS_CLEAN);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;
			}
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			pip = next;
		}
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/*
	 * Mark the pHCI as offline
	 */
	MDI_PHCI_SET_OFFLINE(ph);

	/*
	 * Mark the child mdi_pathinfo nodes as transient
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_SET_OFFLINING(pip);
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);
	/*
	 * Give a chance for any pending commands to execute
	 */
	delay_random(mdi_delay);
	MDI_PHCI_LOCK(ph);
	/* Phase 2: offline each path; any failure reverts the pHCI state. */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		(void) i_mdi_pi_offline(pip, flags);
		MDI_PI_LOCK(pip);
		ct = MDI_PI(pip)->pi_client;
		if (!MDI_PI_IS_OFFLINE(pip)) {
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!pHCI device is busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_PI_UNLOCK(pip);
			MDI_PHCI_SET_ONLINE(ph);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);

	return (rv);
}
5309
5310void
5311mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
5312{
5313	mdi_phci_t	*ph;
5314	mdi_client_t	*ct;
5315	mdi_pathinfo_t	*pip;
5316	mdi_pathinfo_t	*next;
5317	dev_info_t	*cdip;
5318
5319	if (!MDI_PHCI(dip))
5320		return;
5321
5322	ph = i_devi_get_phci(dip);
5323	if (ph == NULL) {
5324		return;
5325	}
5326
5327	MDI_PHCI_LOCK(ph);
5328
5329	if (MDI_PHCI_IS_OFFLINE(ph)) {
5330		/* has no last path */
5331		MDI_PHCI_UNLOCK(ph);
5332		return;
5333	}
5334
5335	pip = ph->ph_path_head;
5336	while (pip != NULL) {
5337		MDI_PI_LOCK(pip);
5338		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5339
5340		ct = MDI_PI(pip)->pi_client;
5341		i_mdi_client_lock(ct, pip);
5342		MDI_PI_UNLOCK(pip);
5343
5344		cdip = ct->ct_dip;
5345		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5346		    (i_mdi_client_compute_state(ct, ph) ==
5347		    MDI_CLIENT_STATE_FAILED)) {
5348			/* Last path. Mark client dip as retiring */
5349			i_mdi_client_unlock(ct);
5350			MDI_PHCI_UNLOCK(ph);
5351			(void) e_ddi_mark_retiring(cdip, cons_array);
5352			MDI_PHCI_LOCK(ph);
5353			pip = next;
5354		} else {
5355			i_mdi_client_unlock(ct);
5356			pip = next;
5357		}
5358	}
5359
5360	MDI_PHCI_UNLOCK(ph);
5361
5362	return;
5363}
5364
5365void
5366mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
5367{
5368	mdi_phci_t	*ph;
5369	mdi_client_t	*ct;
5370	mdi_pathinfo_t	*pip;
5371	mdi_pathinfo_t	*next;
5372	dev_info_t	*cdip;
5373
5374	if (!MDI_PHCI(dip))
5375		return;
5376
5377	ph = i_devi_get_phci(dip);
5378	if (ph == NULL)
5379		return;
5380
5381	MDI_PHCI_LOCK(ph);
5382
5383	if (MDI_PHCI_IS_OFFLINE(ph)) {
5384		MDI_PHCI_UNLOCK(ph);
5385		/* not last path */
5386		return;
5387	}
5388
5389	if (ph->ph_unstable) {
5390		MDI_PHCI_UNLOCK(ph);
5391		/* can't check for constraints */
5392		*constraint = 0;
5393		return;
5394	}
5395
5396	pip = ph->ph_path_head;
5397	while (pip != NULL) {
5398		MDI_PI_LOCK(pip);
5399		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5400
5401		/*
5402		 * The mdi_pathinfo state is OK. Check the client state.
5403		 * If failover in progress fail the pHCI from offlining
5404		 */
5405		ct = MDI_PI(pip)->pi_client;
5406		i_mdi_client_lock(ct, pip);
5407		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5408		    (ct->ct_unstable)) {
5409			/*
5410			 * Failover is in progress, can't check for constraints
5411			 */
5412			MDI_PI_UNLOCK(pip);
5413			i_mdi_client_unlock(ct);
5414			MDI_PHCI_UNLOCK(ph);
5415			*constraint = 0;
5416			return;
5417		}
5418		MDI_PI_UNLOCK(pip);
5419
5420		/*
5421		 * Check to see of we are retiring the last path of this
5422		 * client device...
5423		 */
5424		cdip = ct->ct_dip;
5425		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5426		    (i_mdi_client_compute_state(ct, ph) ==
5427		    MDI_CLIENT_STATE_FAILED)) {
5428			i_mdi_client_unlock(ct);
5429			MDI_PHCI_UNLOCK(ph);
5430			(void) e_ddi_retire_notify(cdip, constraint);
5431			MDI_PHCI_LOCK(ph);
5432			pip = next;
5433		} else {
5434			i_mdi_client_unlock(ct);
5435			pip = next;
5436		}
5437	}
5438
5439	MDI_PHCI_UNLOCK(ph);
5440
5441	return;
5442}
5443
5444/*
5445 * offline the path(s) hanging off the pHCI. If the
5446 * last path to any client, check that constraints
5447 * have been applied.
5448 *
5449 * If constraint is 0, we aren't going to retire the
5450 * pHCI. However we still need to go through the paths
5451 * calling e_ddi_retire_finalize() to clear their
5452 * contract barriers.
5453 */
5454void
5455mdi_phci_retire_finalize(dev_info_t *dip, int phci_only, void *constraint)
5456{
5457	mdi_phci_t	*ph;
5458	mdi_client_t	*ct;
5459	mdi_pathinfo_t	*pip;
5460	mdi_pathinfo_t	*next;
5461	dev_info_t	*cdip;
5462	int		unstable = 0;
5463	int		tmp_constraint;
5464
5465	if (!MDI_PHCI(dip))
5466		return;
5467
5468	ph = i_devi_get_phci(dip);
5469	if (ph == NULL) {
5470		/* no last path and no pips */
5471		return;
5472	}
5473
5474	MDI_PHCI_LOCK(ph);
5475
5476	if (MDI_PHCI_IS_OFFLINE(ph)) {
5477		MDI_PHCI_UNLOCK(ph);
5478		/* no last path and no pips */
5479		return;
5480	}
5481
5482	/*
5483	 * Check to see if the pHCI can be offlined
5484	 */
5485	if (ph->ph_unstable) {
5486		unstable = 1;
5487	}
5488
5489	pip = ph->ph_path_head;
5490	while (pip != NULL) {
5491		MDI_PI_LOCK(pip);
5492		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5493
5494		/*
5495		 * if failover in progress fail the pHCI from offlining
5496		 */
5497		ct = MDI_PI(pip)->pi_client;
5498		i_mdi_client_lock(ct, pip);
5499		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5500		    (ct->ct_unstable)) {
5501			unstable = 1;
5502		}
5503		MDI_PI_UNLOCK(pip);
5504
5505		/*
5506		 * Check to see of we are removing the last path of this
5507		 * client device...
5508		 */
5509		cdip = ct->ct_dip;
5510		if (!phci_only && cdip &&
5511		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5512		    (i_mdi_client_compute_state(ct, ph) ==
5513		    MDI_CLIENT_STATE_FAILED)) {
5514			i_mdi_client_unlock(ct);
5515			MDI_PHCI_UNLOCK(ph);
5516			/*
5517			 * This is the last path to this client.
5518			 *
5519			 * Constraint will only be set to 1 if this client can
5520			 * be retired (as already determined by
5521			 * mdi_phci_retire_notify). However we don't actually
5522			 * need to retire the client (we just retire the last
5523			 * path - MPXIO will then fail all I/Os to the client).
5524			 * But we still need to call e_ddi_retire_finalize so
5525			 * the contract barriers can be cleared. Therefore we
5526			 * temporarily set constraint = 0 so that the client
5527			 * dip is not retired.
5528			 */
5529			tmp_constraint = 0;
5530			(void) e_ddi_retire_finalize(cdip, &tmp_constraint);
5531			MDI_PHCI_LOCK(ph);
5532			pip = next;
5533		} else {
5534			i_mdi_client_unlock(ct);
5535			pip = next;
5536		}
5537	}
5538
5539	if (!phci_only && *((int *)constraint) == 0) {
5540		MDI_PHCI_UNLOCK(ph);
5541		return;
5542	}
5543
5544	/*
5545	 * Cannot offline pip(s)
5546	 */
5547	if (unstable) {
5548		cmn_err(CE_WARN, "%s%d: mdi_phci_retire_finalize: "
5549		    "pHCI in transient state, cannot retire",
5550		    ddi_driver_name(dip), ddi_get_instance(dip));
5551		MDI_PHCI_UNLOCK(ph);
5552		return;
5553	}
5554
5555	/*
5556	 * Mark the pHCI as offline
5557	 */
5558	MDI_PHCI_SET_OFFLINE(ph);
5559
5560	/*
5561	 * Mark the child mdi_pathinfo nodes as transient
5562	 */
5563	pip = ph->ph_path_head;
5564	while (pip != NULL) {
5565		MDI_PI_LOCK(pip);
5566		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5567		MDI_PI_SET_OFFLINING(pip);
5568		MDI_PI_UNLOCK(pip);
5569		pip = next;
5570	}
5571	MDI_PHCI_UNLOCK(ph);
5572	/*
5573	 * Give a chance for any pending commands to execute
5574	 */
5575	delay_random(mdi_delay);
5576	MDI_PHCI_LOCK(ph);
5577	pip = ph->ph_path_head;
5578	while (pip != NULL) {
5579		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5580		(void) i_mdi_pi_offline(pip, 0);
5581		MDI_PI_LOCK(pip);
5582		ct = MDI_PI(pip)->pi_client;
5583		if (!MDI_PI_IS_OFFLINE(pip)) {
5584			cmn_err(CE_WARN, "mdi_phci_retire_finalize: "
5585			    "path %d %s busy, cannot offline",
5586			    mdi_pi_get_path_instance(pip),
5587			    mdi_pi_spathname(pip));
5588			MDI_PI_UNLOCK(pip);
5589			MDI_PHCI_SET_ONLINE(ph);
5590			MDI_PHCI_UNLOCK(ph);
5591			return;
5592		}
5593		MDI_PI_UNLOCK(pip);
5594		pip = next;
5595	}
5596	MDI_PHCI_UNLOCK(ph);
5597
5598	return;
5599}
5600
5601void
5602mdi_phci_unretire(dev_info_t *dip)
5603{
5604	mdi_phci_t	*ph;
5605	mdi_pathinfo_t	*pip;
5606	mdi_pathinfo_t	*next;
5607
5608	ASSERT(MDI_PHCI(dip));
5609
5610	/*
5611	 * Online the phci
5612	 */
5613	i_mdi_phci_online(dip);
5614
5615	ph = i_devi_get_phci(dip);
5616	MDI_PHCI_LOCK(ph);
5617	pip = ph->ph_path_head;
5618	while (pip != NULL) {
5619		MDI_PI_LOCK(pip);
5620		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5621		MDI_PI_UNLOCK(pip);
5622		(void) i_mdi_pi_online(pip, 0);
5623		pip = next;
5624	}
5625	MDI_PHCI_UNLOCK(ph);
5626}
5627
/*
 * i_mdi_client_offline():
 *		Client device offline notification.  Fails with NDI_BUSY if
 *		any path is in a transient state or a failover is in
 *		progress; otherwise marks the client offline and, when
 *		NDI_DEVI_REMOVE is set, severs the binding to the dev_info
 *		node.
 */
/*ARGSUSED*/
static int
i_mdi_client_offline(dev_info_t *dip, uint_t flags)
{
	int		rv = NDI_SUCCESS;
	mdi_client_t	*ct;

	/*
	 * Client component to go offline.  Make sure that we are
	 * not in failing over state and update client state
	 * accordingly
	 */
	ct = i_devi_get_client(dip);
	MDI_DEBUG(2, (MDI_NOTE, dip,
	    "called %p %p", (void *)dip, (void *)ct));
	if (ct != NULL) {
		MDI_CLIENT_LOCK(ct);
		if (ct->ct_unstable) {
			/*
			 * One or more paths are in transient state,
			 * Dont allow offline of a client device
			 */
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!One or more paths to "
			    "this device are in transient state. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_CLIENT_UNLOCK(ct);
			return (NDI_BUSY);
		}
		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
			/*
			 * Failover is in progress, Dont allow DR of
			 * a client device
			 */
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!Client device is Busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_CLIENT_UNLOCK(ct);
			return (NDI_BUSY);
		}
		MDI_CLIENT_SET_OFFLINE(ct);

		/*
		 * Unbind our relationship with the dev_info node
		 */
		if (flags & NDI_DEVI_REMOVE) {
			ct->ct_dip = NULL;
		}
		MDI_CLIENT_UNLOCK(ct);
	}
	return (rv);
}
5682
5683/*
5684 * mdi_pre_attach():
5685 *		Pre attach() notification handler
5686 */
5687/*ARGSUSED*/
5688int
5689mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5690{
5691	/* don't support old DDI_PM_RESUME */
5692	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5693	    (cmd == DDI_PM_RESUME))
5694		return (DDI_FAILURE);
5695
5696	return (DDI_SUCCESS);
5697}
5698
5699/*
5700 * mdi_post_attach():
5701 *		Post attach() notification handler
5702 */
5703/*ARGSUSED*/
5704void
5705mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5706{
5707	mdi_phci_t	*ph;
5708	mdi_client_t	*ct;
5709	mdi_vhci_t	*vh;
5710
5711	if (MDI_PHCI(dip)) {
5712		ph = i_devi_get_phci(dip);
5713		ASSERT(ph != NULL);
5714
5715		MDI_PHCI_LOCK(ph);
5716		switch (cmd) {
5717		case DDI_ATTACH:
5718			MDI_DEBUG(2, (MDI_NOTE, dip,
5719			    "phci post_attach called %p", (void *)ph));
5720			if (error == DDI_SUCCESS) {
5721				MDI_PHCI_SET_ATTACH(ph);
5722			} else {
5723				MDI_DEBUG(1, (MDI_NOTE, dip,
5724				    "!pHCI post_attach failed: error %d",
5725				    error));
5726				MDI_PHCI_SET_DETACH(ph);
5727			}
5728			break;
5729
5730		case DDI_RESUME:
5731			MDI_DEBUG(2, (MDI_NOTE, dip,
5732			    "pHCI post_resume: called %p", (void *)ph));
5733			if (error == DDI_SUCCESS) {
5734				MDI_PHCI_SET_RESUME(ph);
5735			} else {
5736				MDI_DEBUG(1, (MDI_NOTE, dip,
5737				    "!pHCI post_resume failed: error %d",
5738				    error));
5739				MDI_PHCI_SET_SUSPEND(ph);
5740			}
5741			break;
5742		}
5743		MDI_PHCI_UNLOCK(ph);
5744	}
5745
5746	if (MDI_CLIENT(dip)) {
5747		ct = i_devi_get_client(dip);
5748		ASSERT(ct != NULL);
5749
5750		MDI_CLIENT_LOCK(ct);
5751		switch (cmd) {
5752		case DDI_ATTACH:
5753			MDI_DEBUG(2, (MDI_NOTE, dip,
5754			    "client post_attach called %p", (void *)ct));
5755			if (error != DDI_SUCCESS) {
5756				MDI_DEBUG(1, (MDI_NOTE, dip,
5757				    "!client post_attach failed: error %d",
5758				    error));
5759				MDI_CLIENT_SET_DETACH(ct);
5760				MDI_DEBUG(4, (MDI_WARN, dip,
5761				    "i_mdi_pm_reset_client"));
5762				i_mdi_pm_reset_client(ct);
5763				break;
5764			}
5765
5766			/*
5767			 * Client device has successfully attached, inform
5768			 * the vhci.
5769			 */
5770			vh = ct->ct_vhci;
5771			if (vh->vh_ops->vo_client_attached)
5772				(*vh->vh_ops->vo_client_attached)(dip);
5773
5774			MDI_CLIENT_SET_ATTACH(ct);
5775			break;
5776
5777		case DDI_RESUME:
5778			MDI_DEBUG(2, (MDI_NOTE, dip,
5779			    "client post_attach: called %p", (void *)ct));
5780			if (error == DDI_SUCCESS) {
5781				MDI_CLIENT_SET_RESUME(ct);
5782			} else {
5783				MDI_DEBUG(1, (MDI_NOTE, dip,
5784				    "!client post_resume failed: error %d",
5785				    error));
5786				MDI_CLIENT_SET_SUSPEND(ct);
5787			}
5788			break;
5789		}
5790		MDI_CLIENT_UNLOCK(ct);
5791	}
5792}
5793
5794/*
5795 * mdi_pre_detach():
5796 *		Pre detach notification handler
5797 */
5798/*ARGSUSED*/
5799int
5800mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5801{
5802	int rv = DDI_SUCCESS;
5803
5804	if (MDI_CLIENT(dip)) {
5805		(void) i_mdi_client_pre_detach(dip, cmd);
5806	}
5807
5808	if (MDI_PHCI(dip)) {
5809		rv = i_mdi_phci_pre_detach(dip, cmd);
5810	}
5811
5812	return (rv);
5813}
5814
/*ARGSUSED*/
static int
i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int		rv = DDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*failed_pip = NULL;
	mdi_pathinfo_t	*next;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);
	switch (cmd) {
	case DDI_DETACH:
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI pre_detach: called %p", (void *)ph));
		if (!MDI_PHCI_IS_OFFLINE(ph)) {
			/*
			 * mdi_pathinfo nodes are still attached to
			 * this pHCI. Fail the detach for this pHCI.
			 */
			MDI_DEBUG(2, (MDI_WARN, dip,
			    "pHCI pre_detach: paths are still attached %p",
			    (void *)ph));
			rv = DDI_FAILURE;
			break;
		}
		MDI_PHCI_SET_DETACH(ph);
		break;

	case DDI_SUSPEND:
		/*
		 * pHCI is getting suspended.  Since mpxio client
		 * devices may not be suspended at this point, to avoid
		 * a potential stack overflow, it is important to suspend
		 * client devices before pHCI can be suspended.
		 */

		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI pre_suspend: called %p", (void *)ph));
		/*
		 * Suspend all the client devices accessible through this pHCI
		 */
		pip = ph->ph_path_head;
		while (pip != NULL && rv == DDI_SUCCESS) {
			dev_info_t *cdip;
			MDI_PI_LOCK(pip);
			/* remember successor before the path may change */
			next =
			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			MDI_PI_UNLOCK(pip);
			/* only suspend clients not already detached/suspended */
			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
				i_mdi_client_unlock(ct);
				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
				    DDI_SUCCESS) {
					/*
					 * Suspend of one of the client
					 * device has failed.
					 */
					MDI_DEBUG(1, (MDI_WARN, dip,
					    "!suspend of device (%s%d) failed.",
					    ddi_driver_name(cdip),
					    ddi_get_instance(cdip)));
					/* mark where to stop the rollback */
					failed_pip = pip;
					break;
				}
			} else {
				i_mdi_client_unlock(ct);
			}
			pip = next;
		}

		if (rv == DDI_SUCCESS) {
			/*
			 * Suspend of client devices is complete. Proceed
			 * with pHCI suspend.
			 */
			MDI_PHCI_SET_SUSPEND(ph);
		} else {
			/*
			 * Revert back all the suspended client device states
			 * to converse.
			 */
			/* resume every client suspended before failed_pip */
			pip = ph->ph_path_head;
			while (pip != failed_pip) {
				dev_info_t *cdip;
				MDI_PI_LOCK(pip);
				next =
				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
				ct = MDI_PI(pip)->pi_client;
				i_mdi_client_lock(ct, pip);
				cdip = ct->ct_dip;
				MDI_PI_UNLOCK(pip);
				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
					i_mdi_client_unlock(ct);
					(void) devi_attach(cdip, DDI_RESUME);
				} else {
					i_mdi_client_unlock(ct);
				}
				pip = next;
			}
		}
		break;

	default:
		rv = DDI_FAILURE;
		break;
	}
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
5934
5935/*ARGSUSED*/
5936static int
5937i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5938{
5939	int		rv = DDI_SUCCESS;
5940	mdi_client_t	*ct;
5941
5942	ct = i_devi_get_client(dip);
5943	if (ct == NULL) {
5944		return (rv);
5945	}
5946
5947	MDI_CLIENT_LOCK(ct);
5948	switch (cmd) {
5949	case DDI_DETACH:
5950		MDI_DEBUG(2, (MDI_NOTE, dip,
5951		    "client pre_detach: called %p",
5952		     (void *)ct));
5953		MDI_CLIENT_SET_DETACH(ct);
5954		break;
5955
5956	case DDI_SUSPEND:
5957		MDI_DEBUG(2, (MDI_NOTE, dip,
5958		    "client pre_suspend: called %p",
5959		    (void *)ct));
5960		MDI_CLIENT_SET_SUSPEND(ct);
5961		break;
5962
5963	default:
5964		rv = DDI_FAILURE;
5965		break;
5966	}
5967	MDI_CLIENT_UNLOCK(ct);
5968	return (rv);
5969}
5970
5971/*
5972 * mdi_post_detach():
5973 *		Post detach notification handler
5974 */
5975/*ARGSUSED*/
5976void
5977mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5978{
5979	/*
5980	 * Detach/Suspend of mpxio component failed. Update our state
5981	 * too
5982	 */
5983	if (MDI_PHCI(dip))
5984		i_mdi_phci_post_detach(dip, cmd, error);
5985
5986	if (MDI_CLIENT(dip))
5987		i_mdi_client_post_detach(dip, cmd, error);
5988}
5989
5990/*ARGSUSED*/
5991static void
5992i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5993{
5994	mdi_phci_t	*ph;
5995
5996	/*
5997	 * Detach/Suspend of phci component failed. Update our state
5998	 * too
5999	 */
6000	ph = i_devi_get_phci(dip);
6001	if (ph == NULL) {
6002		return;
6003	}
6004
6005	MDI_PHCI_LOCK(ph);
6006	/*
6007	 * Detach of pHCI failed. Restore back converse
6008	 * state
6009	 */
6010	switch (cmd) {
6011	case DDI_DETACH:
6012		MDI_DEBUG(2, (MDI_NOTE, dip,
6013		    "pHCI post_detach: called %p",
6014		    (void *)ph));
6015		if (error != DDI_SUCCESS)
6016			MDI_PHCI_SET_ATTACH(ph);
6017		break;
6018
6019	case DDI_SUSPEND:
6020		MDI_DEBUG(2, (MDI_NOTE, dip,
6021		    "pHCI post_suspend: called %p",
6022		    (void *)ph));
6023		if (error != DDI_SUCCESS)
6024			MDI_PHCI_SET_RESUME(ph);
6025		break;
6026	}
6027	MDI_PHCI_UNLOCK(ph);
6028}
6029
6030/*ARGSUSED*/
6031static void
6032i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
6033{
6034	mdi_client_t	*ct;
6035
6036	ct = i_devi_get_client(dip);
6037	if (ct == NULL) {
6038		return;
6039	}
6040	MDI_CLIENT_LOCK(ct);
6041	/*
6042	 * Detach of Client failed. Restore back converse
6043	 * state
6044	 */
6045	switch (cmd) {
6046	case DDI_DETACH:
6047		MDI_DEBUG(2, (MDI_NOTE, dip,
6048		    "client post_detach: called %p", (void *)ct));
6049		if (DEVI_IS_ATTACHING(dip)) {
6050			MDI_DEBUG(4, (MDI_NOTE, dip,
6051			    "i_mdi_pm_rele_client\n"));
6052			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6053		} else {
6054			MDI_DEBUG(4, (MDI_NOTE, dip,
6055			    "i_mdi_pm_reset_client\n"));
6056			i_mdi_pm_reset_client(ct);
6057		}
6058		if (error != DDI_SUCCESS)
6059			MDI_CLIENT_SET_ATTACH(ct);
6060		break;
6061
6062	case DDI_SUSPEND:
6063		MDI_DEBUG(2, (MDI_NOTE, dip,
6064		    "called %p", (void *)ct));
6065		if (error != DDI_SUCCESS)
6066			MDI_CLIENT_SET_RESUME(ct);
6067		break;
6068	}
6069	MDI_CLIENT_UNLOCK(ct);
6070}
6071
6072int
6073mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
6074{
6075	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
6076}
6077
6078/*
6079 * create and install per-path (client - pHCI) statistics
6080 * I/O stats supported: nread, nwritten, reads, and writes
6081 * Error stats - hard errors, soft errors, & transport errors
6082 */
6083int
6084mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
6085{
6086	kstat_t			*kiosp, *kerrsp;
6087	struct pi_errs		*nsp;
6088	struct mdi_pi_kstats	*mdi_statp;
6089
6090	if (MDI_PI(pip)->pi_kstats != NULL)
6091		return (MDI_SUCCESS);
6092
6093	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
6094	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
6095		return (MDI_FAILURE);
6096	}
6097
6098	(void) strcat(ksname, ",err");
6099	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
6100	    KSTAT_TYPE_NAMED,
6101	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
6102	if (kerrsp == NULL) {
6103		kstat_delete(kiosp);
6104		return (MDI_FAILURE);
6105	}
6106
6107	nsp = (struct pi_errs *)kerrsp->ks_data;
6108	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
6109	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
6110	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
6111	    KSTAT_DATA_UINT32);
6112	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
6113	    KSTAT_DATA_UINT32);
6114	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
6115	    KSTAT_DATA_UINT32);
6116	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
6117	    KSTAT_DATA_UINT32);
6118	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
6119	    KSTAT_DATA_UINT32);
6120	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
6121	    KSTAT_DATA_UINT32);
6122	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
6123	    KSTAT_DATA_UINT32);
6124	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
6125
6126	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
6127	mdi_statp->pi_kstat_ref = 1;
6128	mdi_statp->pi_kstat_iostats = kiosp;
6129	mdi_statp->pi_kstat_errstats = kerrsp;
6130	kstat_install(kiosp);
6131	kstat_install(kerrsp);
6132	MDI_PI(pip)->pi_kstats = mdi_statp;
6133	return (MDI_SUCCESS);
6134}
6135
6136/*
6137 * destroy per-path properties
6138 */
6139static void
6140i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
6141{
6142
6143	struct mdi_pi_kstats *mdi_statp;
6144
6145	if (MDI_PI(pip)->pi_kstats == NULL)
6146		return;
6147	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
6148		return;
6149
6150	MDI_PI(pip)->pi_kstats = NULL;
6151
6152	/*
6153	 * the kstat may be shared between multiple pathinfo nodes
6154	 * decrement this pathinfo's usage, removing the kstats
6155	 * themselves when the last pathinfo reference is removed.
6156	 */
6157	ASSERT(mdi_statp->pi_kstat_ref > 0);
6158	if (--mdi_statp->pi_kstat_ref != 0)
6159		return;
6160
6161	kstat_delete(mdi_statp->pi_kstat_iostats);
6162	kstat_delete(mdi_statp->pi_kstat_errstats);
6163	kmem_free(mdi_statp, sizeof (*mdi_statp));
6164}
6165
6166/*
6167 * update I/O paths KSTATS
6168 */
6169void
6170mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
6171{
6172	kstat_t *iostatp;
6173	size_t xfer_cnt;
6174
6175	ASSERT(pip != NULL);
6176
6177	/*
6178	 * I/O can be driven across a path prior to having path
6179	 * statistics available, i.e. probe(9e).
6180	 */
6181	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
6182		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
6183		xfer_cnt = bp->b_bcount - bp->b_resid;
6184		if (bp->b_flags & B_READ) {
6185			KSTAT_IO_PTR(iostatp)->reads++;
6186			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
6187		} else {
6188			KSTAT_IO_PTR(iostatp)->writes++;
6189			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
6190		}
6191	}
6192}
6193
6194/*
6195 * Enable the path(specific client/target/initiator)
6196 * Enabling a path means that MPxIO may select the enabled path for routing
6197 * future I/O requests, subject to other path state constraints.
6198 */
6199int
6200mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
6201{
6202	mdi_phci_t	*ph;
6203
6204	ph = MDI_PI(pip)->pi_phci;
6205	if (ph == NULL) {
6206		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6207		    "!failed: path %s %p: NULL ph",
6208		    mdi_pi_spathname(pip), (void *)pip));
6209		return (MDI_FAILURE);
6210	}
6211
6212	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
6213		MDI_ENABLE_OP);
6214	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6215	    "!returning success pip = %p. ph = %p",
6216	    (void *)pip, (void *)ph));
6217	return (MDI_SUCCESS);
6218
6219}
6220
6221/*
6222 * Disable the path (specific client/target/initiator)
6223 * Disabling a path means that MPxIO will not select the disabled path for
6224 * routing any new I/O requests.
6225 */
6226int
6227mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
6228{
6229	mdi_phci_t	*ph;
6230
6231	ph = MDI_PI(pip)->pi_phci;
6232	if (ph == NULL) {
6233		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6234		    "!failed: path %s %p: NULL ph",
6235		    mdi_pi_spathname(pip), (void *)pip));
6236		return (MDI_FAILURE);
6237	}
6238
6239	(void) i_mdi_enable_disable_path(pip,
6240	    ph->ph_vhci, flags, MDI_DISABLE_OP);
6241	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6242	    "!returning success pip = %p. ph = %p",
6243	    (void *)pip, (void *)ph));
6244	return (MDI_SUCCESS);
6245}
6246
6247/*
6248 * disable the path to a particular pHCI (pHCI specified in the phci_path
6249 * argument) for a particular client (specified in the client_path argument).
6250 * Disabling a path means that MPxIO will not select the disabled path for
6251 * routing any new I/O requests.
6252 * NOTE: this will be removed once the NWS files are changed to use the new
6253 * mdi_{enable,disable}_path interfaces
6254 */
6255int
6256mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6257{
6258	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
6259}
6260
6261/*
6262 * Enable the path to a particular pHCI (pHCI specified in the phci_path
6263 * argument) for a particular client (specified in the client_path argument).
6264 * Enabling a path means that MPxIO may select the enabled path for routing
6265 * future I/O requests, subject to other path state constraints.
6266 * NOTE: this will be removed once the NWS files are changed to use the new
6267 * mdi_{enable,disable}_path interfaces
6268 */
6269
6270int
6271mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6272{
6273	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
6274}
6275
6276/*
6277 * Common routine for doing enable/disable.
6278 */
6279static mdi_pathinfo_t *
6280i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
6281		int op)
6282{
6283	int		sync_flag = 0;
6284	int		rv;
6285	mdi_pathinfo_t 	*next;
6286	int		(*f)() = NULL;
6287
6288	/*
6289	 * Check to make sure the path is not already in the
6290	 * requested state. If it is just return the next path
6291	 * as we have nothing to do here.
6292	 */
6293	if ((MDI_PI_IS_DISABLE(pip) && op == MDI_DISABLE_OP) ||
6294	    (!MDI_PI_IS_DISABLE(pip) && op == MDI_ENABLE_OP)) {
6295		MDI_PI_LOCK(pip);
6296		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6297		MDI_PI_UNLOCK(pip);
6298		return (next);
6299	}
6300
6301	f = vh->vh_ops->vo_pi_state_change;
6302
6303	sync_flag = (flags << 8) & 0xf00;
6304
6305	/*
6306	 * Do a callback into the mdi consumer to let it
6307	 * know that path is about to get enabled/disabled.
6308	 */
6309	if (f != NULL) {
6310		rv = (*f)(vh->vh_dip, pip, 0,
6311			MDI_PI_EXT_STATE(pip),
6312			MDI_EXT_STATE_CHANGE | sync_flag |
6313			op | MDI_BEFORE_STATE_CHANGE);
6314		if (rv != MDI_SUCCESS) {
6315			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6316			    "vo_pi_state_change: failed rv = %x", rv));
6317		}
6318	}
6319	MDI_PI_LOCK(pip);
6320	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6321
6322	switch (flags) {
6323		case USER_DISABLE:
6324			if (op == MDI_DISABLE_OP) {
6325				MDI_PI_SET_USER_DISABLE(pip);
6326			} else {
6327				MDI_PI_SET_USER_ENABLE(pip);
6328			}
6329			break;
6330		case DRIVER_DISABLE:
6331			if (op == MDI_DISABLE_OP) {
6332				MDI_PI_SET_DRV_DISABLE(pip);
6333			} else {
6334				MDI_PI_SET_DRV_ENABLE(pip);
6335			}
6336			break;
6337		case DRIVER_DISABLE_TRANSIENT:
6338			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
6339				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
6340			} else {
6341				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
6342			}
6343			break;
6344	}
6345	MDI_PI_UNLOCK(pip);
6346	/*
6347	 * Do a callback into the mdi consumer to let it
6348	 * know that path is now enabled/disabled.
6349	 */
6350	if (f != NULL) {
6351		rv = (*f)(vh->vh_dip, pip, 0,
6352			MDI_PI_EXT_STATE(pip),
6353			MDI_EXT_STATE_CHANGE | sync_flag |
6354			op | MDI_AFTER_STATE_CHANGE);
6355		if (rv != MDI_SUCCESS) {
6356			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6357			    "vo_pi_state_change failed: rv = %x", rv));
6358		}
6359	}
6360	return (next);
6361}
6362
6363/*
6364 * Common routine for doing enable/disable.
6365 * NOTE: this will be removed once the NWS files are changed to use the new
6366 * mdi_{enable,disable}_path has been putback
6367 */
6368int
6369i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
6370{
6371
6372	mdi_phci_t	*ph;
6373	mdi_vhci_t	*vh = NULL;
6374	mdi_client_t	*ct;
6375	mdi_pathinfo_t	*next, *pip;
6376	int		found_it;
6377
6378	ph = i_devi_get_phci(pdip);
6379	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6380	    "!op = %d pdip = %p cdip = %p", op, (void *)pdip,
6381	    (void *)cdip));
6382	if (ph == NULL) {
6383		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6384		    "!failed: operation %d: NULL ph", op));
6385		return (MDI_FAILURE);
6386	}
6387
6388	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6389		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6390		    "!failed: invalid operation %d", op));
6391		return (MDI_FAILURE);
6392	}
6393
6394	vh = ph->ph_vhci;
6395
6396	if (cdip == NULL) {
6397		/*
6398		 * Need to mark the Phci as enabled/disabled.
6399		 */
6400		MDI_DEBUG(4, (MDI_NOTE, cdip ? cdip : pdip,
6401		    "op %d for the phci", op));
6402		MDI_PHCI_LOCK(ph);
6403		switch (flags) {
6404			case USER_DISABLE:
6405				if (op == MDI_DISABLE_OP) {
6406					MDI_PHCI_SET_USER_DISABLE(ph);
6407				} else {
6408					MDI_PHCI_SET_USER_ENABLE(ph);
6409				}
6410				break;
6411			case DRIVER_DISABLE:
6412				if (op == MDI_DISABLE_OP) {
6413					MDI_PHCI_SET_DRV_DISABLE(ph);
6414				} else {
6415					MDI_PHCI_SET_DRV_ENABLE(ph);
6416				}
6417				break;
6418			case DRIVER_DISABLE_TRANSIENT:
6419				if (op == MDI_DISABLE_OP) {
6420					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6421				} else {
6422					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6423				}
6424				break;
6425			default:
6426				MDI_PHCI_UNLOCK(ph);
6427				MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6428				    "!invalid flag argument= %d", flags));
6429		}
6430
6431		/*
6432		 * Phci has been disabled. Now try to enable/disable
6433		 * path info's to each client.
6434		 */
6435		pip = ph->ph_path_head;
6436		while (pip != NULL) {
6437			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6438		}
6439		MDI_PHCI_UNLOCK(ph);
6440	} else {
6441
6442		/*
6443		 * Disable a specific client.
6444		 */
6445		ct = i_devi_get_client(cdip);
6446		if (ct == NULL) {
6447			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6448			    "!failed: operation = %d: NULL ct", op));
6449			return (MDI_FAILURE);
6450		}
6451
6452		MDI_CLIENT_LOCK(ct);
6453		pip = ct->ct_path_head;
6454		found_it = 0;
6455		while (pip != NULL) {
6456			MDI_PI_LOCK(pip);
6457			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6458			if (MDI_PI(pip)->pi_phci == ph) {
6459				MDI_PI_UNLOCK(pip);
6460				found_it = 1;
6461				break;
6462			}
6463			MDI_PI_UNLOCK(pip);
6464			pip = next;
6465		}
6466
6467
6468		MDI_CLIENT_UNLOCK(ct);
6469		if (found_it == 0) {
6470			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6471			    "!failed. Could not find corresponding pip\n"));
6472			return (MDI_FAILURE);
6473		}
6474
6475		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6476	}
6477
6478	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6479	    "!op %d returning success pdip = %p cdip = %p",
6480	    op, (void *)pdip, (void *)cdip));
6481	return (MDI_SUCCESS);
6482}
6483
6484/*
6485 * Ensure phci powered up
6486 */
6487static void
6488i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
6489{
6490	dev_info_t	*ph_dip;
6491
6492	ASSERT(pip != NULL);
6493	ASSERT(MDI_PI_LOCKED(pip));
6494
6495	if (MDI_PI(pip)->pi_pm_held) {
6496		return;
6497	}
6498
6499	ph_dip = mdi_pi_get_phci(pip);
6500	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6501	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6502	if (ph_dip == NULL) {
6503		return;
6504	}
6505
6506	MDI_PI_UNLOCK(pip);
6507	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt was %d",
6508	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6509	pm_hold_power(ph_dip);
6510	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt is %d",
6511	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6512	MDI_PI_LOCK(pip);
6513
6514	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
6515	if (DEVI(ph_dip)->devi_pm_info)
6516		MDI_PI(pip)->pi_pm_held = 1;
6517}
6518
6519/*
6520 * Allow phci powered down
6521 */
6522static void
6523i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
6524{
6525	dev_info_t	*ph_dip = NULL;
6526
6527	ASSERT(pip != NULL);
6528	ASSERT(MDI_PI_LOCKED(pip));
6529
6530	if (MDI_PI(pip)->pi_pm_held == 0) {
6531		return;
6532	}
6533
6534	ph_dip = mdi_pi_get_phci(pip);
6535	ASSERT(ph_dip != NULL);
6536
6537	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6538	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6539
6540	MDI_PI_UNLOCK(pip);
6541	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6542	    "kidsupcnt was %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6543	pm_rele_power(ph_dip);
6544	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6545	    "kidsupcnt is %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6546	MDI_PI_LOCK(pip);
6547
6548	MDI_PI(pip)->pi_pm_held = 0;
6549}
6550
/* Bump the client's pm reference count by incr; client lock must be held. */
static void
i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
{
	ASSERT(MDI_CLIENT_LOCKED(ct));

	ct->ct_power_cnt += incr;
	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
	    "%p ct_power_cnt = %d incr = %d",
	    (void *)ct, ct->ct_power_cnt, incr));
	ASSERT(ct->ct_power_cnt >= 0);
}
6562
/*
 * Release the pm hold on every path of the client; client lock must be
 * held.  Each path is pinned with mdi_hold_path() around the release.
 */
static void
i_mdi_rele_all_phci(mdi_client_t *ct)
{
	mdi_pathinfo_t  *pip;

	ASSERT(MDI_CLIENT_LOCKED(ct));
	pip = (mdi_pathinfo_t *)ct->ct_path_head;
	while (pip != NULL) {
		mdi_hold_path(pip);
		MDI_PI_LOCK(pip);
		i_mdi_pm_rele_pip(pip);
		MDI_PI_UNLOCK(pip);
		mdi_rele_path(pip);
		/*
		 * NOTE(review): pi_client_link is read after
		 * mdi_rele_path() drops our hold; this appears to rely
		 * on the held client lock keeping the node on ct's path
		 * list -- confirm against mdi_rele_path()/i_mdi_pi_free.
		 */
		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
	}
}
6579
6580static void
6581i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6582{
6583	ASSERT(MDI_CLIENT_LOCKED(ct));
6584
6585	if (i_ddi_devi_attached(ct->ct_dip)) {
6586		ct->ct_power_cnt -= decr;
6587		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6588		    "%p ct_power_cnt = %d decr = %d",
6589		    (void *)ct, ct->ct_power_cnt, decr));
6590	}
6591
6592	ASSERT(ct->ct_power_cnt >= 0);
6593	if (ct->ct_power_cnt == 0) {
6594		i_mdi_rele_all_phci(ct);
6595		return;
6596	}
6597}
6598
/*
 * Reset the client's pm bookkeeping: zero the power count, release the
 * pm holds on all paths, and clear the config/unconfig hold markers.
 * Client lock must be held.
 */
static void
i_mdi_pm_reset_client(mdi_client_t *ct)
{
	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
	    "%p ct_power_cnt = %d", (void *)ct, ct->ct_power_cnt));
	ASSERT(MDI_CLIENT_LOCKED(ct));
	ct->ct_power_cnt = 0;
	i_mdi_rele_all_phci(ct);
	ct->ct_powercnt_config = 0;
	ct->ct_powercnt_unconfig = 0;
	ct->ct_powercnt_reset = 1;
}
6611
6612static int
6613i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6614{
6615	int		ret;
6616	dev_info_t	*ph_dip;
6617
6618	MDI_PI_LOCK(pip);
6619	i_mdi_pm_hold_pip(pip);
6620
6621	ph_dip = mdi_pi_get_phci(pip);
6622	MDI_PI_UNLOCK(pip);
6623
6624	/* bring all components of phci to full power */
6625	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6626	    "pm_powerup for %s%d %p", ddi_driver_name(ph_dip),
6627	    ddi_get_instance(ph_dip), (void *)pip));
6628
6629	ret = pm_powerup(ph_dip);
6630
6631	if (ret == DDI_FAILURE) {
6632		MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6633		    "pm_powerup FAILED for %s%d %p",
6634		    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip),
6635		    (void *)pip));
6636
6637		MDI_PI_LOCK(pip);
6638		i_mdi_pm_rele_pip(pip);
6639		MDI_PI_UNLOCK(pip);
6640		return (MDI_FAILURE);
6641	}
6642
6643	return (MDI_SUCCESS);
6644}
6645
/*
 * Power up the pHCI of every eligible path of the client.  Returns
 * MDI_SUCCESS if at least one pHCI was powered up.  Client lock must be
 * held on entry; it is dropped and reacquired around each powerup, with
 * the path pinned by mdi_hold_path() across the unlocked window.
 */
static int
i_mdi_power_all_phci(mdi_client_t *ct)
{
	mdi_pathinfo_t  *pip;
	int		succeeded = 0;

	ASSERT(MDI_CLIENT_LOCKED(ct));
	pip = (mdi_pathinfo_t *)ct->ct_path_head;
	while (pip != NULL) {
		/*
		 * Don't power if MDI_PATHINFO_STATE_FAULT
		 * or MDI_PATHINFO_STATE_OFFLINE.
		 */
		if (MDI_PI_IS_INIT(pip) ||
		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
			mdi_hold_path(pip);
			MDI_CLIENT_UNLOCK(ct);
			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
				succeeded = 1;

			ASSERT(ct == MDI_PI(pip)->pi_client);
			MDI_CLIENT_LOCK(ct);
			mdi_rele_path(pip);
		}
		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
	}

	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
}
6675
6676/*
6677 * mdi_bus_power():
6678 *		1. Place the phci(s) into powered up state so that
6679 *		   client can do power management
6680 *		2. Ensure phci powered up as client power managing
6681 * Return Values:
6682 *		MDI_SUCCESS
6683 *		MDI_FAILURE
6684 */
6685int
6686mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
6687    void *arg, void *result)
6688{
6689	int			ret = MDI_SUCCESS;
6690	pm_bp_child_pwrchg_t	*bpc;
6691	mdi_client_t		*ct;
6692	dev_info_t		*cdip;
6693	pm_bp_has_changed_t	*bphc;
6694
6695	/*
6696	 * BUS_POWER_NOINVOL not supported
6697	 */
6698	if (op == BUS_POWER_NOINVOL)
6699		return (MDI_FAILURE);
6700
6701	/*
6702	 * ignore other OPs.
6703	 * return quickly to save cou cycles on the ct processing
6704	 */
6705	switch (op) {
6706	case BUS_POWER_PRE_NOTIFICATION:
6707	case BUS_POWER_POST_NOTIFICATION:
6708		bpc = (pm_bp_child_pwrchg_t *)arg;
6709		cdip = bpc->bpc_dip;
6710		break;
6711	case BUS_POWER_HAS_CHANGED:
6712		bphc = (pm_bp_has_changed_t *)arg;
6713		cdip = bphc->bphc_dip;
6714		break;
6715	default:
6716		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
6717	}
6718
6719	ASSERT(MDI_CLIENT(cdip));
6720
6721	ct = i_devi_get_client(cdip);
6722	if (ct == NULL)
6723		return (MDI_FAILURE);
6724
6725	/*
6726	 * wait till the mdi_pathinfo node state change are processed
6727	 */
6728	MDI_CLIENT_LOCK(ct);
6729	switch (op) {
6730	case BUS_POWER_PRE_NOTIFICATION:
6731		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6732		    "BUS_POWER_PRE_NOTIFICATION:"
6733		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
6734		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6735		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));
6736
6737		/* serialize power level change per client */
6738		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6739			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6740
6741		MDI_CLIENT_SET_POWER_TRANSITION(ct);
6742
6743		if (ct->ct_power_cnt == 0) {
6744			ret = i_mdi_power_all_phci(ct);
6745		}
6746
6747		/*
6748		 * if new_level > 0:
6749		 *	- hold phci(s)
6750		 *	- power up phci(s) if not already
6751		 * ignore power down
6752		 */
6753		if (bpc->bpc_nlevel > 0) {
6754			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
6755				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6756				    "i_mdi_pm_hold_client\n"));
6757				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6758			}
6759		}
6760		break;
6761	case BUS_POWER_POST_NOTIFICATION:
6762		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6763		    "BUS_POWER_POST_NOTIFICATION:"
6764		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d",
6765		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6766		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
6767		    *(int *)result));
6768
6769		if (*(int *)result == DDI_SUCCESS) {
6770			if (bpc->bpc_nlevel > 0) {
6771				MDI_CLIENT_SET_POWER_UP(ct);
6772			} else {
6773				MDI_CLIENT_SET_POWER_DOWN(ct);
6774			}
6775		}
6776
6777		/* release the hold we did in pre-notification */
6778		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
6779		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
6780			MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6781			    "i_mdi_pm_rele_client\n"));
6782			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6783		}
6784
6785		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
6786			/* another thread might started attaching */
6787			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6788				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6789				    "i_mdi_pm_rele_client\n"));
6790				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6791			/* detaching has been taken care in pm_post_unconfig */
6792			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
6793				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6794				    "i_mdi_pm_reset_client\n"));
6795				i_mdi_pm_reset_client(ct);
6796			}
6797		}
6798
6799		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
6800		cv_broadcast(&ct->ct_powerchange_cv);
6801
6802		break;
6803
6804	/* need to do more */
6805	case BUS_POWER_HAS_CHANGED:
6806		MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6807		    "BUS_POWER_HAS_CHANGED:"
6808		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
6809		    ddi_node_name(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
6810		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));
6811
6812		if (bphc->bphc_nlevel > 0 &&
6813		    bphc->bphc_nlevel > bphc->bphc_olevel) {
6814			if (ct->ct_power_cnt == 0) {
6815				ret = i_mdi_power_all_phci(ct);
6816			}
6817			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6818			    "i_mdi_pm_hold_client\n"));
6819			i_mdi_pm_hold_client(ct, ct->ct_path_count);
6820		}
6821
6822		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
6823			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6824			    "i_mdi_pm_rele_client\n"));
6825			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6826		}
6827		break;
6828	}
6829
6830	MDI_CLIENT_UNLOCK(ct);
6831	return (ret);
6832}
6833
/*
 * Prepare one client for configuration: power up its pHCIs (if no pm
 * references are held yet) and take a pm hold per path, marking the
 * hold in ct_powercnt_config so it is released exactly once later.
 */
static int
i_mdi_pm_pre_config_one(dev_info_t *child)
{
	int		ret = MDI_SUCCESS;
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return (MDI_FAILURE);

	MDI_CLIENT_LOCK(ct);
	/* wait for any in-flight power transition to finish */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	if (!MDI_CLIENT_IS_FAILED(ct)) {
		MDI_CLIENT_UNLOCK(ct);
		MDI_DEBUG(4, (MDI_NOTE, child, "already configured\n"));
		return (MDI_SUCCESS);
	}

	/* a config hold is already outstanding; don't double-hold */
	if (ct->ct_powercnt_config) {
		MDI_CLIENT_UNLOCK(ct);
		MDI_DEBUG(4, (MDI_NOTE, child, "already held\n"));
		return (MDI_SUCCESS);
	}

	if (ct->ct_power_cnt == 0) {
		ret = i_mdi_power_all_phci(ct);
	}
	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
	i_mdi_pm_hold_client(ct, ct->ct_path_count);
	ct->ct_powercnt_config = 1;
	ct->ct_powercnt_reset = 0;
	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}
6870
6871static int
6872i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6873{
6874	int			ret = MDI_SUCCESS;
6875	dev_info_t		*cdip;
6876	int			circ;
6877
6878	ASSERT(MDI_VHCI(vdip));
6879
6880	/* ndi_devi_config_one */
6881	if (child) {
6882		ASSERT(DEVI_BUSY_OWNED(vdip));
6883		return (i_mdi_pm_pre_config_one(child));
6884	}
6885
6886	/* devi_config_common */
6887	ndi_devi_enter(vdip, &circ);
6888	cdip = ddi_get_child(vdip);
6889	while (cdip) {
6890		dev_info_t *next = ddi_get_next_sibling(cdip);
6891
6892		ret = i_mdi_pm_pre_config_one(cdip);
6893		if (ret != MDI_SUCCESS)
6894			break;
6895		cdip = next;
6896	}
6897	ndi_devi_exit(vdip, circ);
6898	return (ret);
6899}
6900
/*
 * Prepare one client for unconfiguration: power up its pHCIs and take a
 * pm hold per path (marked in ct_powercnt_unconfig).  *held is set when
 * a hold is outstanding so the caller knows to release it afterwards.
 * Fails an NDI_AUTODETACH request when the client is powered down.
 */
static int
i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
{
	int		ret = MDI_SUCCESS;
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return (MDI_FAILURE);

	MDI_CLIENT_LOCK(ct);
	/* wait for any in-flight power transition to finish */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	if (!i_ddi_devi_attached(child)) {
		MDI_DEBUG(4, (MDI_NOTE, child, "node detached already\n"));
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_SUCCESS);
	}

	/* don't auto-modunload a powered-down client */
	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    (flags & NDI_AUTODETACH)) {
		MDI_DEBUG(4, (MDI_NOTE, child, "auto-modunload\n"));
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_FAILURE);
	}

	/* an unconfig hold is already outstanding; don't double-hold */
	if (ct->ct_powercnt_unconfig) {
		MDI_DEBUG(4, (MDI_NOTE, child, "ct_powercnt_held\n"));
		MDI_CLIENT_UNLOCK(ct);
		*held = 1;
		return (MDI_SUCCESS);
	}

	if (ct->ct_power_cnt == 0) {
		ret = i_mdi_power_all_phci(ct);
	}
	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
	i_mdi_pm_hold_client(ct, ct->ct_path_count);
	ct->ct_powercnt_unconfig = 1;
	ct->ct_powercnt_reset = 0;
	MDI_CLIENT_UNLOCK(ct);
	if (ret == MDI_SUCCESS)
		*held = 1;
	return (ret);
}
6947
6948static int
6949i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6950    int flags)
6951{
6952	int			ret = MDI_SUCCESS;
6953	dev_info_t		*cdip;
6954	int			circ;
6955
6956	ASSERT(MDI_VHCI(vdip));
6957	*held = 0;
6958
6959	/* ndi_devi_unconfig_one */
6960	if (child) {
6961		ASSERT(DEVI_BUSY_OWNED(vdip));
6962		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6963	}
6964
6965	/* devi_unconfig_common */
6966	ndi_devi_enter(vdip, &circ);
6967	cdip = ddi_get_child(vdip);
6968	while (cdip) {
6969		dev_info_t *next = ddi_get_next_sibling(cdip);
6970
6971		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6972		cdip = next;
6973	}
6974	ndi_devi_exit(vdip, circ);
6975
6976	if (*held)
6977		ret = MDI_SUCCESS;
6978
6979	return (ret);
6980}
6981
/*
 * Power-management post-configure processing for a single client node.
 * Undoes the hold taken by i_mdi_pm_pre_config_one(): either resets the
 * client's PM accounting (if the node ended up powered down / detached)
 * or releases one hold per valid (online/standby) path.
 */
static void
i_mdi_pm_post_config_one(dev_info_t *child)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return;

	MDI_CLIENT_LOCK(ct);
	/* wait out any in-flight power transition before inspecting state */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* nothing to release unless pre-config took a hold */
	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
		MDI_DEBUG(4, (MDI_NOTE, child, "not configured\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* client has not been updated */
	if (MDI_CLIENT_IS_FAILED(ct)) {
		MDI_DEBUG(4, (MDI_NOTE, child, "client failed\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* another thread might have powered it down or detached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    !DEVI_IS_ATTACHING(child)) ||
	    (!i_ddi_devi_attached(child) &&
	    !DEVI_IS_ATTACHING(child))) {
		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(ct);
	} else {
		mdi_pathinfo_t  *pip, *next;
		int	valid_path_count = 0;

		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
		/* count paths still usable (online or standby) */
		pip = ct->ct_path_head;
		while (pip != NULL) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
				valid_path_count ++;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		i_mdi_pm_rele_client(ct, valid_path_count);
	}
	ct->ct_powercnt_config = 0;
	MDI_CLIENT_UNLOCK(ct);
}
7034
7035static void
7036i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
7037{
7038	int		circ;
7039	dev_info_t	*cdip;
7040
7041	ASSERT(MDI_VHCI(vdip));
7042
7043	/* ndi_devi_config_one */
7044	if (child) {
7045		ASSERT(DEVI_BUSY_OWNED(vdip));
7046		i_mdi_pm_post_config_one(child);
7047		return;
7048	}
7049
7050	/* devi_config_common */
7051	ndi_devi_enter(vdip, &circ);
7052	cdip = ddi_get_child(vdip);
7053	while (cdip) {
7054		dev_info_t *next = ddi_get_next_sibling(cdip);
7055
7056		i_mdi_pm_post_config_one(cdip);
7057		cdip = next;
7058	}
7059	ndi_devi_exit(vdip, circ);
7060}
7061
/*
 * Power-management post-unconfigure processing for a single client node.
 * Undoes the hold taken by i_mdi_pm_pre_unconfig_one(): either resets the
 * client's PM accounting or releases one hold per valid path.
 */
static void
i_mdi_pm_post_unconfig_one(dev_info_t *child)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return;

	MDI_CLIENT_LOCK(ct);
	/* wait out any in-flight power transition before inspecting state */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* nothing to release unless pre-unconfig took a hold */
	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
		MDI_DEBUG(4, (MDI_NOTE, child, "not held\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* failure detaching or another thread just attached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    i_ddi_devi_attached(child)) ||
	    (!i_ddi_devi_attached(child) &&
	    !DEVI_IS_ATTACHING(child))) {
		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(ct);
	} else {
		mdi_pathinfo_t  *pip, *next;
		int	valid_path_count = 0;

		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
		/* count paths still usable (online or standby) */
		pip = ct->ct_path_head;
		while (pip != NULL) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
				valid_path_count ++;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		i_mdi_pm_rele_client(ct, valid_path_count);
		/*
		 * NOTE(review): ct_powercnt_unconfig is cleared only on this
		 * branch, unlike the unconditional clear of ct_powercnt_config
		 * in i_mdi_pm_post_config_one() -- presumably intentional
		 * (reset branch handles accounting itself); verify before
		 * changing.
		 */
		ct->ct_powercnt_unconfig = 0;
	}

	MDI_CLIENT_UNLOCK(ct);
}
7108
7109static void
7110i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
7111{
7112	int			circ;
7113	dev_info_t		*cdip;
7114
7115	ASSERT(MDI_VHCI(vdip));
7116
7117	if (!held) {
7118		MDI_DEBUG(4, (MDI_NOTE, vdip, "held = %d", held));
7119		return;
7120	}
7121
7122	if (child) {
7123		ASSERT(DEVI_BUSY_OWNED(vdip));
7124		i_mdi_pm_post_unconfig_one(child);
7125		return;
7126	}
7127
7128	ndi_devi_enter(vdip, &circ);
7129	cdip = ddi_get_child(vdip);
7130	while (cdip) {
7131		dev_info_t *next = ddi_get_next_sibling(cdip);
7132
7133		i_mdi_pm_post_unconfig_one(cdip);
7134		cdip = next;
7135	}
7136	ndi_devi_exit(vdip, circ);
7137}
7138
/*
 * mdi_power():
 *		vHCI entry point for power-management operations.  'op'
 *		selects pre/post (un)config processing or an explicit
 *		hold/release on a single client.  For the config/unconfig
 *		ops, 'devnm' (when non-NULL) names the client child and
 *		'args' carries the held-count pointer/value; for
 *		HOLD/RELE_POWER, 'args' is the client dev_info_t.
 * Return Values:
 *		MDI_SUCCESS / MDI_FAILURE (from the pre-* handlers;
 *		post-* ops always report MDI_SUCCESS).
 */
int
mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
{
	int			circ, ret = MDI_SUCCESS;
	dev_info_t		*client_dip = NULL;
	mdi_client_t		*ct;

	/*
	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
	 * Power up pHCI for the named client device.
	 * Note: Before the client is enumerated under vhci by phci,
	 * client_dip can be NULL. Then proceed to power up all the
	 * pHCIs.
	 */
	if (devnm != NULL) {
		/* circ is only initialized/used when devnm is non-NULL */
		ndi_devi_enter(vdip, &circ);
		client_dip = ndi_devi_findchild(vdip, devnm);
	}

	MDI_DEBUG(4, (MDI_NOTE, vdip,
	    "op = %d %s %p", op, devnm ? devnm : "", (void *)client_dip));

	switch (op) {
	case MDI_PM_PRE_CONFIG:
		ret = i_mdi_pm_pre_config(vdip, client_dip);
		break;

	case MDI_PM_PRE_UNCONFIG:
		/* args points at the caller's 'held' flag */
		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
		    flags);
		break;

	case MDI_PM_POST_CONFIG:
		i_mdi_pm_post_config(vdip, client_dip);
		break;

	case MDI_PM_POST_UNCONFIG:
		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
		break;

	case MDI_PM_HOLD_POWER:
	case MDI_PM_RELE_POWER:
		ASSERT(args);

		/* for these two ops, args is the client dip itself */
		client_dip = (dev_info_t *)args;
		ASSERT(MDI_CLIENT(client_dip));

		ct = i_devi_get_client(client_dip);
		MDI_CLIENT_LOCK(ct);

		if (op == MDI_PM_HOLD_POWER) {
			/* only power up and hold if nothing is held yet */
			if (ct->ct_power_cnt == 0) {
				(void) i_mdi_power_all_phci(ct);
				MDI_DEBUG(4, (MDI_NOTE, client_dip,
				    "i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		} else {
			/*
			 * Release: a client mid-attach gets a normal
			 * release; otherwise drop all PM accounting.
			 */
			if (DEVI_IS_ATTACHING(client_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, client_dip,
				    "i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			} else {
				MDI_DEBUG(4, (MDI_NOTE, client_dip,
				    "i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		MDI_CLIENT_UNLOCK(ct);
		break;

	default:
		break;
	}

	if (devnm)
		ndi_devi_exit(vdip, circ);

	return (ret);
}
7220
7221int
7222mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
7223{
7224	mdi_vhci_t *vhci;
7225
7226	if (!MDI_VHCI(dip))
7227		return (MDI_FAILURE);
7228
7229	if (mdi_class) {
7230		vhci = DEVI(dip)->devi_mdi_xhci;
7231		ASSERT(vhci);
7232		*mdi_class = vhci->vh_class;
7233	}
7234
7235	return (MDI_SUCCESS);
7236}
7237
7238int
7239mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
7240{
7241	mdi_phci_t *phci;
7242
7243	if (!MDI_PHCI(dip))
7244		return (MDI_FAILURE);
7245
7246	if (mdi_class) {
7247		phci = DEVI(dip)->devi_mdi_xhci;
7248		ASSERT(phci);
7249		*mdi_class = phci->ph_vhci->vh_class;
7250	}
7251
7252	return (MDI_SUCCESS);
7253}
7254
7255int
7256mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
7257{
7258	mdi_client_t *client;
7259
7260	if (!MDI_CLIENT(dip))
7261		return (MDI_FAILURE);
7262
7263	if (mdi_class) {
7264		client = DEVI(dip)->devi_mdi_client;
7265		ASSERT(client);
7266		*mdi_class = client->ct_vhci->vh_class;
7267	}
7268
7269	return (MDI_SUCCESS);
7270}
7271
7272void *
7273mdi_client_get_vhci_private(dev_info_t *dip)
7274{
7275	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7276	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7277		mdi_client_t	*ct;
7278		ct = i_devi_get_client(dip);
7279		return (ct->ct_vprivate);
7280	}
7281	return (NULL);
7282}
7283
7284void
7285mdi_client_set_vhci_private(dev_info_t *dip, void *data)
7286{
7287	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7288	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7289		mdi_client_t	*ct;
7290		ct = i_devi_get_client(dip);
7291		ct->ct_vprivate = data;
7292	}
7293}
7294/*
7295 * mdi_pi_get_vhci_private():
7296 *		Get the vhci private information associated with the
7297 *		mdi_pathinfo node
7298 */
7299void *
7300mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
7301{
7302	caddr_t	vprivate = NULL;
7303	if (pip) {
7304		vprivate = MDI_PI(pip)->pi_vprivate;
7305	}
7306	return (vprivate);
7307}
7308
7309/*
7310 * mdi_pi_set_vhci_private():
7311 *		Set the vhci private information in the mdi_pathinfo node
7312 */
7313void
7314mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
7315{
7316	if (pip) {
7317		MDI_PI(pip)->pi_vprivate = priv;
7318	}
7319}
7320
7321/*
7322 * mdi_phci_get_vhci_private():
7323 *		Get the vhci private information associated with the
7324 *		mdi_phci node
7325 */
7326void *
7327mdi_phci_get_vhci_private(dev_info_t *dip)
7328{
7329	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7330	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7331		mdi_phci_t	*ph;
7332		ph = i_devi_get_phci(dip);
7333		return (ph->ph_vprivate);
7334	}
7335	return (NULL);
7336}
7337
7338/*
7339 * mdi_phci_set_vhci_private():
7340 *		Set the vhci private information in the mdi_phci node
7341 */
7342void
7343mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
7344{
7345	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7346	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7347		mdi_phci_t	*ph;
7348		ph = i_devi_get_phci(dip);
7349		ph->ph_vprivate = priv;
7350	}
7351}
7352
/* Return non-zero if the pathinfo node carries the HIDDEN flag. */
int
mdi_pi_ishidden(mdi_pathinfo_t *pip)
{
	return (MDI_PI_FLAGS_IS_HIDDEN(pip));
}
7358
/* Return non-zero if the pathinfo node carries the DEVICE_REMOVED flag. */
int
mdi_pi_device_isremoved(mdi_pathinfo_t *pip)
{
	return (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip));
}
7364
7365/* Return 1 if all client paths are device_removed */
7366static int
7367i_mdi_client_all_devices_removed(mdi_client_t *ct)
7368{
7369	mdi_pathinfo_t  *pip;
7370	int		all_devices_removed = 1;
7371
7372	MDI_CLIENT_LOCK(ct);
7373	for (pip = ct->ct_path_head; pip;
7374	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link) {
7375		if (!mdi_pi_device_isremoved(pip)) {
7376			all_devices_removed = 0;
7377			break;
7378		}
7379	}
7380	MDI_CLIENT_UNLOCK(ct);
7381	return (all_devices_removed);
7382}
7383
7384/*
7385 * When processing path hotunplug, represent device removal.
7386 */
/*
 * When processing path hotunplug, represent device removal.
 * Marks the path DEVICE_REMOVED and HIDDEN; if every path of the owning
 * client is now removed, propagates removal to the client devinfo node.
 * Returns 1 if state changed, 0 if the path was already marked removed.
 */
int
mdi_pi_device_remove(mdi_pathinfo_t *pip)
{
	mdi_client_t	*ct;

	MDI_PI_LOCK(pip);
	if (mdi_pi_device_isremoved(pip)) {
		MDI_PI_UNLOCK(pip);
		return (0);
	}
	MDI_PI_FLAGS_SET_DEVICE_REMOVED(pip);
	MDI_PI_FLAGS_SET_HIDDEN(pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * If all paths associated with the client are now DEVICE_REMOVED,
	 * reflect DEVICE_REMOVED in the client.
	 */
	/*
	 * NOTE(review): pi_client is read here after dropping pi_mutex --
	 * presumably the client linkage is stable for the caller's lifetime;
	 * verify before restructuring.
	 */
	ct = MDI_PI(pip)->pi_client;
	if (ct && ct->ct_dip && i_mdi_client_all_devices_removed(ct))
		(void) ndi_devi_device_remove(ct->ct_dip);
	else
		i_ddi_di_cache_invalidate();

	return (1);
}
7413
7414/*
7415 * When processing hotplug, if a path marked mdi_pi_device_isremoved()
7416 * is now accessible then this interfaces is used to represent device insertion.
7417 */
7418int
7419mdi_pi_device_insert(mdi_pathinfo_t *pip)
7420{
7421	MDI_PI_LOCK(pip);
7422	if (!mdi_pi_device_isremoved(pip)) {
7423		MDI_PI_UNLOCK(pip);
7424		return (0);
7425	}
7426	MDI_PI_FLAGS_CLR_DEVICE_REMOVED(pip);
7427	MDI_PI_FLAGS_CLR_HIDDEN(pip);
7428	MDI_PI_UNLOCK(pip);
7429
7430	i_ddi_di_cache_invalidate();
7431
7432	return (1);
7433}
7434
7435/*
7436 * List of vhci class names:
7437 * A vhci class name must be in this list only if the corresponding vhci
7438 * driver intends to use the mdi provided bus config implementation
7439 * (i.e., mdi_vhci_bus_config()).
7440 */
7441static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
7442#define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))
7443
7444/*
7445 * During boot time, the on-disk vhci cache for every vhci class is read
7446 * in the form of an nvlist and stored here.
7447 */
7448static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];
7449
7450/* nvpair names in vhci cache nvlist */
7451#define	MDI_VHCI_CACHE_VERSION	1
7452#define	MDI_NVPNAME_VERSION	"version"
7453#define	MDI_NVPNAME_PHCIS	"phcis"
7454#define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
7455
7456/*
7457 * Given vhci class name, return its on-disk vhci cache filename.
7458 * Memory for the returned filename which includes the full path is allocated
7459 * by this function.
7460 */
7461static char *
7462vhclass2vhcache_filename(char *vhclass)
7463{
7464	char *filename;
7465	int len;
7466	static char *fmt = "/etc/devices/mdi_%s_cache";
7467
7468	/*
7469	 * fmt contains the on-disk vhci cache file name format;
7470	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
7471	 */
7472
7473	/* the -1 below is to account for "%s" in the format string */
7474	len = strlen(fmt) + strlen(vhclass) - 1;
7475	filename = kmem_alloc(len, KM_SLEEP);
7476	(void) snprintf(filename, len, fmt, vhclass);
7477	ASSERT(len == (strlen(filename) + 1));
7478	return (filename);
7479}
7480
7481/*
7482 * initialize the vhci cache related data structures and read the on-disk
7483 * vhci cached data into memory.
7484 */
/*
 * Allocate and initialize the per-vHCI bus-config state (mdi_vhci_config_t),
 * including its cache lock, hash and on-disk filename, then populate the
 * in-core cache from the nvlist read at boot (or from disk on re-attach).
 */
static void
setup_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc;
	mdi_vhci_cache_t *vhcache;
	int i;
	nvlist_t *nvl = NULL;

	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
	vh->vh_config = vhc;
	vhcache = &vhc->vhc_vhcache;

	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);

	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);

	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);

	/*
	 * Create string hash; same as mod_hash_create_strhash() except that
	 * we use NULL key destructor.
	 */
	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
	    mdi_bus_config_cache_hash_size,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);

	/*
	 * The on-disk vhci cache is read during booting prior to the
	 * lights-out period by mdi_read_devices_files().
	 */
	/* claim the boot-time nvlist for this class (ownership transfers) */
	for (i = 0; i < N_VHCI_CLASSES; i++) {
		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
			nvl = vhcache_nvl[i];
			vhcache_nvl[i] = NULL;
			break;
		}
	}

	/*
	 * this is to cover the case of some one manually causing unloading
	 * (or detaching) and reloading (or attaching) of a vhci driver.
	 */
	if (nvl == NULL && modrootloaded)
		nvl = read_on_disk_vhci_cache(vh->vh_class);

	if (nvl != NULL) {
		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
		else  {
			/* corrupt cache: start fresh; it will be rebuilt */
			cmn_err(CE_WARN,
			    "%s: data file corrupted, will recreate",
			    vhc->vhc_vhcache_filename);
		}
		rw_exit(&vhcache->vhcache_lock);
		nvlist_free(nvl);
	}

	/* flush any dirty cache to disk before the system goes down */
	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");

	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
}
7551
7552/*
7553 * free all vhci cache related resources
7554 */
/*
 * free all vhci cache related resources
 * Stops the async threads first; frees the filename, client hash, phci,
 * client and pathinfo lists, then the locks and the config struct itself.
 * Returns MDI_FAILURE if the async threads could not be stopped.
 */
static int
destroy_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_phci_t *cphci, *cphci_next;
	mdi_vhcache_client_t *cct, *cct_next;
	mdi_vhcache_pathinfo_t *cpi, *cpi_next;

	/* must quiesce the flush/async-config threads before freeing */
	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
		return (MDI_FAILURE);

	kmem_free(vhc->vhc_vhcache_filename,
	    strlen(vhc->vhc_vhcache_filename) + 1);

	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);

	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci_next) {
		cphci_next = cphci->cphci_next;
		free_vhcache_phci(cphci);
	}

	/* each client owns a list of pathinfo entries; free those first */
	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
		cct_next = cct->cct_next;
		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
			cpi_next = cpi->cpi_next;
			free_vhcache_pathinfo(cpi);
		}
		free_vhcache_client(cct);
	}

	rw_destroy(&vhcache->vhcache_lock);

	mutex_destroy(&vhc->vhc_lock);
	cv_destroy(&vhc->vhc_cv);
	kmem_free(vhc, sizeof (mdi_vhci_config_t));
	return (MDI_SUCCESS);
}
7594
7595/*
7596 * Stop all vhci cache related async threads and free their resources.
7597 */
/*
 * Stop all vhci cache related async threads and free their resources.
 * Signals MDI_VHC_EXIT, waits for the flush thread and all async config
 * threads to drain, frees the queued async config requests, and performs
 * a final synchronous cache flush if the cache is dirty.
 */
static int
stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
{
	mdi_async_client_config_t *acc, *acc_next;

	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_flags |= MDI_VHC_EXIT;
	ASSERT(vhc->vhc_acc_thrcount >= 0);
	cv_broadcast(&vhc->vhc_cv);

	/* poll (with random backoff) until all worker threads have exited */
	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
	    vhc->vhc_acc_thrcount != 0) {
		mutex_exit(&vhc->vhc_lock);
		delay_random(mdi_delay);
		mutex_enter(&vhc->vhc_lock);
	}

	vhc->vhc_flags &= ~MDI_VHC_EXIT;

	/* discard any async bus-config requests that never ran */
	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
		acc_next = acc->acc_next;
		free_async_client_config(acc);
	}
	vhc->vhc_acc_list_head = NULL;
	vhc->vhc_acc_list_tail = NULL;
	vhc->vhc_acc_count = 0;

	/* final flush; on failure re-mark dirty so nothing is lost */
	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
		mutex_exit(&vhc->vhc_lock);
		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
			vhcache_dirty(vhc);
			return (MDI_FAILURE);
		}
	} else
		mutex_exit(&vhc->vhc_lock);

	if (callb_delete(vhc->vhc_cbid) != 0)
		return (MDI_FAILURE);

	return (MDI_SUCCESS);
}
7640
7641/*
7642 * Stop vhci cache flush thread
7643 */
7644/* ARGSUSED */
static boolean_t
stop_vhcache_flush_thread(void *arg, int code)
{
	/* callb(9F) callback: runs at shutdown (CB_CL_UADMIN_PRE_VFS) */
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;

	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_flags |= MDI_VHC_EXIT;
	cv_broadcast(&vhc->vhc_cv);

	/* wait for the flush thread to notice MDI_VHC_EXIT and terminate */
	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
		mutex_exit(&vhc->vhc_lock);
		delay_random(mdi_delay);
		mutex_enter(&vhc->vhc_lock);
	}

	/* flush any remaining dirty cache state synchronously */
	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
		mutex_exit(&vhc->vhc_lock);
		(void) flush_vhcache(vhc, 1);
	} else
		mutex_exit(&vhc->vhc_lock);

	return (B_TRUE);
}
7669
7670/*
7671 * Enqueue the vhcache phci (cphci) at the tail of the list
7672 */
7673static void
7674enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
7675{
7676	cphci->cphci_next = NULL;
7677	if (vhcache->vhcache_phci_head == NULL)
7678		vhcache->vhcache_phci_head = cphci;
7679	else
7680		vhcache->vhcache_phci_tail->cphci_next = cphci;
7681	vhcache->vhcache_phci_tail = cphci;
7682}
7683
7684/*
7685 * Enqueue the vhcache pathinfo (cpi) at the tail of the list
7686 */
7687static void
7688enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7689    mdi_vhcache_pathinfo_t *cpi)
7690{
7691	cpi->cpi_next = NULL;
7692	if (cct->cct_cpi_head == NULL)
7693		cct->cct_cpi_head = cpi;
7694	else
7695		cct->cct_cpi_tail->cpi_next = cpi;
7696	cct->cct_cpi_tail = cpi;
7697}
7698
7699/*
7700 * Enqueue the vhcache pathinfo (cpi) at the correct location in the
7701 * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7702 * flag set come at the beginning of the list. All cpis which have this
7703 * flag set come at the end of the list.
7704 */
7705static void
7706enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7707    mdi_vhcache_pathinfo_t *newcpi)
7708{
7709	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
7710
7711	if (cct->cct_cpi_head == NULL ||
7712	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
7713		enqueue_tail_vhcache_pathinfo(cct, newcpi);
7714	else {
7715		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
7716		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
7717		    prev_cpi = cpi, cpi = cpi->cpi_next)
7718			;
7719
7720		if (prev_cpi == NULL)
7721			cct->cct_cpi_head = newcpi;
7722		else
7723			prev_cpi->cpi_next = newcpi;
7724
7725		newcpi->cpi_next = cpi;
7726
7727		if (cpi == NULL)
7728			cct->cct_cpi_tail = newcpi;
7729	}
7730}
7731
7732/*
7733 * Enqueue the vhcache client (cct) at the tail of the list
7734 */
7735static void
7736enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
7737    mdi_vhcache_client_t *cct)
7738{
7739	cct->cct_next = NULL;
7740	if (vhcache->vhcache_client_head == NULL)
7741		vhcache->vhcache_client_head = cct;
7742	else
7743		vhcache->vhcache_client_tail->cct_next = cct;
7744	vhcache->vhcache_client_tail = cct;
7745}
7746
/* Free an array of 'nelem' NUL-terminated strings and the array itself. */
static void
free_string_array(char **str, int nelem)
{
	int idx;

	if (str == NULL)
		return;

	for (idx = 0; idx < nelem; idx++) {
		if (str[idx] != NULL)
			kmem_free(str[idx], strlen(str[idx]) + 1);
	}
	kmem_free(str, sizeof (char *) * nelem);
}
7760
/* Free a vhcache phci entry and its pathname string. */
static void
free_vhcache_phci(mdi_vhcache_phci_t *cphci)
{
	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
	kmem_free(cphci, sizeof (*cphci));
}
7767
/* Free a vhcache pathinfo entry and its address string. */
static void
free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
{
	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
	kmem_free(cpi, sizeof (*cpi));
}
7774
/* Free a vhcache client entry and its "name@addr" string. */
static void
free_vhcache_client(mdi_vhcache_client_t *cct)
{
	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
	kmem_free(cct, sizeof (*cct));
}
7781
7782static char *
7783vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7784{
7785	char *name_addr;
7786	int len;
7787
7788	len = strlen(ct_name) + strlen(ct_addr) + 2;
7789	name_addr = kmem_alloc(len, KM_SLEEP);
7790	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7791
7792	if (ret_len)
7793		*ret_len = len;
7794	return (name_addr);
7795}
7796
7797/*
7798 * Copy the contents of paddrnvl to vhci cache.
7799 * paddrnvl nvlist contains path information for a vhci client.
7800 * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7801 */
static void
paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
    mdi_vhcache_client_t *cct)
{
	nvpair_t *nvp = NULL;
	mdi_vhcache_pathinfo_t *cpi;
	uint_t nelem;
	uint32_t *val;

	/*
	 * Each nvpair is pi_addr -> (phci-id, cpi_flags); the phci-id
	 * indexes cphci_list (see mainnvl_to_vhcache() layout comment).
	 */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
		ASSERT(nelem == 2);
		cpi->cpi_cphci = cphci_list[val[0]];
		cpi->cpi_flags = val[1];
		enqueue_tail_vhcache_pathinfo(cct, cpi);
	}
}
7822
7823/*
7824 * Copy the contents of caddrmapnvl to vhci cache.
7825 * caddrmapnvl nvlist contains vhci client address to phci client address
7826 * mappings. See the comment in mainnvl_to_vhcache() for the format of
7827 * this nvlist.
7828 */
static void
caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
    mdi_vhcache_phci_t *cphci_list[])
{
	nvpair_t *nvp = NULL;
	nvlist_t *paddrnvl;
	mdi_vhcache_client_t *cct;

	/* each nvpair is "<clientname>@<clientaddress>" -> paddrs nvlist */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
		(void) nvpair_value_nvlist(nvp, &paddrnvl);
		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
		/* the client must contain at least one path */
		ASSERT(cct->cct_cpi_head != NULL);

		/* index the client by name@addr for fast lookup */
		enqueue_vhcache_client(vhcache, cct);
		(void) mod_hash_insert(vhcache->vhcache_client_hash,
		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
	}
}
7851
7852/*
7853 * Copy the contents of the main nvlist to vhci cache.
7854 *
7855 * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7856 * The nvlist contains the mappings between the vhci client addresses and
7857 * their corresponding phci client addresses.
7858 *
7859 * The structure of the nvlist is as follows:
7860 *
7861 * Main nvlist:
7862 *	NAME		TYPE		DATA
7863 *	version		int32		version number
7864 *	phcis		string array	array of phci paths
7865 *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7866 *
7867 * structure of c2paddrs_nvl:
7868 *	NAME		TYPE		DATA
7869 *	caddr1		nvlist_t	paddrs_nvl1
7870 *	caddr2		nvlist_t	paddrs_nvl2
7871 *	...
7872 * where caddr1, caddr2, ... are vhci client name and addresses in the
7873 * form of "<clientname>@<clientaddress>".
7874 * (for example: "ssd@2000002037cd9f72");
7875 * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7876 *
7877 * structure of paddrs_nvl:
7878 *	NAME		TYPE		DATA
7879 *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7880 *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7881 *	...
7882 * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7883 * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7884 * phci-ids are integers that identify pHCIs to which the
7885 * the bus specific address belongs to. These integers are used as an index
7886 * into to the phcis string array in the main nvlist to get the pHCI path.
7887 */
static int
mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
{
	char **phcis, **phci_namep;
	uint_t nphcis;
	mdi_vhcache_phci_t *cphci, **cphci_list;
	nvlist_t *caddrmapnvl;
	int32_t ver;
	int i;
	size_t cphci_list_size;

	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));

	/* reject a missing or mismatched cache version */
	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
	    ver != MDI_VHCI_CACHE_VERSION)
		return (MDI_FAILURE);

	/* no phci list means an empty (but valid) cache */
	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
	    &nphcis) != 0)
		return (MDI_SUCCESS);

	ASSERT(nphcis > 0);

	/*
	 * Build a temporary index so phci-ids in the client map can be
	 * resolved to cphci structures.
	 */
	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
		enqueue_vhcache_phci(vhcache, cphci);
		cphci_list[i] = cphci;
	}

	ASSERT(vhcache->vhcache_phci_head != NULL);

	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);

	kmem_free(cphci_list, cphci_list_size);
	return (MDI_SUCCESS);
}
7928
7929/*
7930 * Build paddrnvl for the specified client using the information in the
7931 * vhci cache and add it to the caddrmapnnvl.
7932 * Returns 0 on success, errno on failure.
7933 */
static int
vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
    nvlist_t *caddrmapnvl)
{
	mdi_vhcache_pathinfo_t *cpi;
	nvlist_t *nvl;
	int err;
	uint32_t val[2];

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
		return (err);

	/* one (phci-id, flags) pair per path, keyed by the path address */
	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		val[0] = cpi->cpi_cphci->cphci_id;
		val[1] = cpi->cpi_flags;
		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
		    != 0)
			goto out;
	}

	/* nvlist_add_nvlist copies nvl, so it is freed either way below */
	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
out:
	nvlist_free(nvl);
	return (err);
}
7961
7962/*
7963 * Build caddrmapnvl using the information in the vhci cache
7964 * and add it to the mainnvl.
7965 * Returns 0 on success, errno on failure.
7966 */
static int
vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
{
	mdi_vhcache_client_t *cct;
	nvlist_t *nvl;
	int err;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
		return (err);

	/* add one paddrs nvlist per cached client */
	for (cct = vhcache->vhcache_client_head; cct != NULL;
	    cct = cct->cct_next) {
		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
			goto out;
	}

	/* nvlist_add_nvlist copies nvl, so it is freed either way below */
	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
out:
	nvlist_free(nvl);
	return (err);
}
7990
7991/*
7992 * Build nvlist using the information in the vhci cache.
7993 * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7994 * Returns nvl on success, NULL on failure.
7995 */
7996static nvlist_t *
7997vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7998{
7999	mdi_vhcache_phci_t *cphci;
8000	uint_t phci_count;
8001	char **phcis;
8002	nvlist_t *nvl;
8003	int err, i;
8004
8005	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
8006		nvl = NULL;
8007		goto out;
8008	}
8009
8010	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
8011	    MDI_VHCI_CACHE_VERSION)) != 0)
8012		goto out;
8013
8014	rw_enter(&vhcache->vhcache_lock, RW_READER);
8015	if (vhcache->vhcache_phci_head == NULL) {
8016		rw_exit(&vhcache->vhcache_lock);
8017		return (nvl);
8018	}
8019
8020	phci_count = 0;
8021	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8022	    cphci = cphci->cphci_next)
8023		cphci->cphci_id = phci_count++;
8024
8025	/* build phci pathname list */
8026	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
8027	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
8028	    cphci = cphci->cphci_next, i++)
8029		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
8030
8031	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
8032	    phci_count);
8033	free_string_array(phcis, phci_count);
8034
8035	if (err == 0 &&
8036	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
8037		rw_exit(&vhcache->vhcache_lock);
8038		return (nvl);
8039	}
8040
8041	rw_exit(&vhcache->vhcache_lock);
8042out:
8043	if (nvl)
8044		nvlist_free(nvl);
8045	return (NULL);
8046}
8047
8048/*
8049 * Lookup vhcache phci structure for the specified phci path.
8050 */
8051static mdi_vhcache_phci_t *
8052lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
8053{
8054	mdi_vhcache_phci_t *cphci;
8055
8056	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8057
8058	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8059	    cphci = cphci->cphci_next) {
8060		if (strcmp(cphci->cphci_path, phci_path) == 0)
8061			return (cphci);
8062	}
8063
8064	return (NULL);
8065}
8066
8067/*
8068 * Lookup vhcache phci structure for the specified phci.
8069 */
8070static mdi_vhcache_phci_t *
8071lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
8072{
8073	mdi_vhcache_phci_t *cphci;
8074
8075	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8076
8077	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8078	    cphci = cphci->cphci_next) {
8079		if (cphci->cphci_phci == ph)
8080			return (cphci);
8081	}
8082
8083	return (NULL);
8084}
8085
8086/*
8087 * Add the specified phci to the vhci cache if not already present.
8088 */
8089static void
8090vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
8091{
8092	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8093	mdi_vhcache_phci_t *cphci;
8094	char *pathname;
8095	int cache_updated;
8096
8097	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8098
8099	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8100	(void) ddi_pathname(ph->ph_dip, pathname);
8101	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
8102	    != NULL) {
8103		cphci->cphci_phci = ph;
8104		cache_updated = 0;
8105	} else {
8106		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
8107		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
8108		cphci->cphci_phci = ph;
8109		enqueue_vhcache_phci(vhcache, cphci);
8110		cache_updated = 1;
8111	}
8112
8113	rw_exit(&vhcache->vhcache_lock);
8114
8115	/*
8116	 * Since a new phci has been added, reset
8117	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
8118	 * during next vhcache_discover_paths().
8119	 */
8120	mutex_enter(&vhc->vhc_lock);
8121	vhc->vhc_path_discovery_cutoff_time = 0;
8122	mutex_exit(&vhc->vhc_lock);
8123
8124	kmem_free(pathname, MAXPATHLEN);
8125	if (cache_updated)
8126		vhcache_dirty(vhc);
8127}
8128
8129/*
8130 * Remove the reference to the specified phci from the vhci cache.
8131 */
8132static void
8133vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
8134{
8135	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8136	mdi_vhcache_phci_t *cphci;
8137
8138	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8139	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
8140		/* do not remove the actual mdi_vhcache_phci structure */
8141		cphci->cphci_phci = NULL;
8142	}
8143	rw_exit(&vhcache->vhcache_lock);
8144}
8145
8146static void
8147init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
8148    mdi_vhcache_lookup_token_t *src)
8149{
8150	if (src == NULL) {
8151		dst->lt_cct = NULL;
8152		dst->lt_cct_lookup_time = 0;
8153	} else {
8154		dst->lt_cct = src->lt_cct;
8155		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
8156	}
8157}
8158
8159/*
8160 * Look up vhcache client for the specified client.
8161 */
static mdi_vhcache_client_t *
lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
    mdi_vhcache_lookup_token_t *token)
{
	mod_hash_val_t hv;
	char *name_addr;
	int len;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	/*
	 * If no vhcache clean occurred since the last lookup, we can
	 * simply return the cct from the last lookup operation.
	 * It works because ccts are never freed except during the vhcache
	 * cleanup operation.
	 */
	if (token != NULL &&
	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
		return (token->lt_cct);

	/* hash key is the string built from ct_name and ct_addr */
	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
	if (mod_hash_find(vhcache->vhcache_client_hash,
	    (mod_hash_key_t)name_addr, &hv) == 0) {
		/* found: cache the result in the caller's token */
		if (token) {
			token->lt_cct = (mdi_vhcache_client_t *)hv;
			token->lt_cct_lookup_time = ddi_get_lbolt64();
		}
	} else {
		/* not found: invalidate the token so it isn't reused */
		if (token) {
			token->lt_cct = NULL;
			token->lt_cct_lookup_time = 0;
		}
		hv = NULL;
	}
	kmem_free(name_addr, len);
	return ((mdi_vhcache_client_t *)hv);
}
8199
8200/*
8201 * Add the specified path to the vhci cache if not already present.
8202 * Also add the vhcache client for the client corresponding to this path
8203 * if it doesn't already exist.
8204 */
static void
vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_pathinfo_t *cpi;
	mdi_phci_t *ph = pip->pi_phci;
	mdi_client_t *ct = pip->pi_client;
	int cache_updated = 0;

	rw_enter(&vhcache->vhcache_lock, RW_WRITER);

	/* if vhcache client for this pip doesn't already exist, add it */
	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
	    NULL)) == NULL) {
		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
		    ct->ct_guid, NULL);
		enqueue_vhcache_client(vhcache, cct);
		/* the hash key is the cct_name_addr string owned by the cct */
		(void) mod_hash_insert(vhcache->vhcache_client_hash,
		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
		cache_updated = 1;
	}

	/* look for an existing cache entry matching this phci and address */
	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		if (cpi->cpi_cphci->cphci_phci == ph &&
		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
			cpi->cpi_pip = pip;
			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
				/*
				 * The path exists after all: clear the hint
				 * and re-sort so hinted-absent paths go back
				 * to the end of the list. The hint is
				 * persisted on disk, so mark the cache dirty.
				 */
				cpi->cpi_flags &=
				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
				sort_vhcache_paths(cct);
				cache_updated = 1;
			}
			break;
		}
	}

	/* no existing entry; create one for this path */
	if (cpi == NULL) {
		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
		ASSERT(cpi->cpi_cphci != NULL);
		cpi->cpi_pip = pip;
		enqueue_vhcache_pathinfo(cct, cpi);
		cache_updated = 1;
	}

	rw_exit(&vhcache->vhcache_lock);

	/* schedule an on-disk cache update if anything changed */
	if (cache_updated)
		vhcache_dirty(vhc);
}
8258
8259/*
8260 * Remove the reference to the specified path from the vhci cache.
8261 */
8262static void
8263vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8264{
8265	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8266	mdi_client_t *ct = pip->pi_client;
8267	mdi_vhcache_client_t *cct;
8268	mdi_vhcache_pathinfo_t *cpi;
8269
8270	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8271	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8272	    NULL)) != NULL) {
8273		for (cpi = cct->cct_cpi_head; cpi != NULL;
8274		    cpi = cpi->cpi_next) {
8275			if (cpi->cpi_pip == pip) {
8276				cpi->cpi_pip = NULL;
8277				break;
8278			}
8279		}
8280	}
8281	rw_exit(&vhcache->vhcache_lock);
8282}
8283
8284/*
8285 * Flush the vhci cache to disk.
8286 * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
8287 */
static int
flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
{
	nvlist_t *nvl;
	int err;
	int rv;

	/*
	 * It is possible that the system may shutdown before
	 * i_ddi_io_initialized (during stmsboot for example). To allow for
	 * flushing the cache in this case do not check for
	 * i_ddi_io_initialized when force flag is set.
	 */
	if (force_flag == 0 && !i_ddi_io_initialized())
		return (MDI_FAILURE);

	/* snapshot the cache into an nvlist and write it to the file */
	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
		nvlist_free(nvl);
	} else
		err = EFAULT;

	rv = MDI_SUCCESS;
	mutex_enter(&vhc->vhc_lock);
	if (err != 0) {
		if (err == EROFS) {
			/*
			 * Read-only filesystem: stop trying to flush (set
			 * MDI_VHC_READONLY_FS), drop the dirty/error state,
			 * and report success to the caller.
			 */
			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
			    MDI_VHC_VHCACHE_DIRTY);
		} else {
			/* warn only on the first failure of a failure run */
			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
				cmn_err(CE_CONT, "%s: update failed\n",
				    vhc->vhc_vhcache_filename);
				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
			}
			rv = MDI_FAILURE;
		}
	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
		/* a previously reported failure has now cleared */
		cmn_err(CE_CONT,
		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
	}
	mutex_exit(&vhc->vhc_lock);

	return (rv);
}
8334
8335/*
8336 * Call flush_vhcache() to flush the vhci cache at the scheduled time.
8337 * Exits itself if left idle for the idle timeout period.
8338 */
static void
vhcache_flush_thread(void *arg)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	clock_t idle_time, quit_at_ticks;
	callb_cpr_t cprinfo;

	/* number of seconds to sleep idle before exiting */
	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;

	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
	    "mdi_vhcache_flush");
	mutex_enter(&vhc->vhc_lock);
	for (; ; ) {
		/*
		 * While the cache is dirty, wait until the scheduled flush
		 * time (vhc_flush_at_ticks) and then flush. A failed flush
		 * re-dirties the cache so it is retried on the next pass.
		 */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
				CALLB_CPR_SAFE_BEGIN(&cprinfo);
				(void) cv_timedwait(&vhc->vhc_cv,
				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
			} else {
				/* drop vhc_lock across the actual file I/O */
				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
				mutex_exit(&vhc->vhc_lock);

				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
					vhcache_dirty(vhc);

				mutex_enter(&vhc->vhc_lock);
			}
		}

		quit_at_ticks = ddi_get_lbolt() + idle_time;

		/* idle-wait for new work until the idle timeout expires */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
		    ddi_get_lbolt() < quit_at_ticks) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
			    quit_at_ticks);
			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
		}

		/* exit if asked to, or if we stayed idle the whole timeout */
		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
			goto out;
	}

out:
	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
	CALLB_CPR_EXIT(&cprinfo);
}
8392
8393/*
8394 * Make vhci cache dirty and schedule flushing by vhcache flush thread.
8395 */
8396static void
8397vhcache_dirty(mdi_vhci_config_t *vhc)
8398{
8399	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8400	int create_thread;
8401
8402	rw_enter(&vhcache->vhcache_lock, RW_READER);
8403	/* do not flush cache until the cache is fully built */
8404	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8405		rw_exit(&vhcache->vhcache_lock);
8406		return;
8407	}
8408	rw_exit(&vhcache->vhcache_lock);
8409
8410	mutex_enter(&vhc->vhc_lock);
8411	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
8412		mutex_exit(&vhc->vhc_lock);
8413		return;
8414	}
8415
8416	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
8417	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
8418	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
8419	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
8420		cv_broadcast(&vhc->vhc_cv);
8421		create_thread = 0;
8422	} else {
8423		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
8424		create_thread = 1;
8425	}
8426	mutex_exit(&vhc->vhc_lock);
8427
8428	if (create_thread)
8429		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
8430		    0, &p0, TS_RUN, minclsyspri);
8431}
8432
8433/*
 * phci bus config structure - one for each phci bus config operation that
8435 * we initiate on behalf of a vhci.
8436 */
typedef struct mdi_phci_bus_config_s {
	char *phbc_phci_path;	/* pathname of the phci to bus config */
	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
	struct mdi_phci_bus_config_s *phbc_next;	/* work list linkage */
} mdi_phci_bus_config_t;

/* vhci bus config structure - one for each vhci bus config operation */
typedef struct mdi_vhci_bus_config_s {
	ddi_bus_config_op_t vhbc_op;	/* bus config op */
	major_t vhbc_op_major;		/* bus config op major */
	uint_t vhbc_op_flags;		/* bus config op flags */
	kmutex_t vhbc_lock;		/* protects vhbc_thr_count */
	kcondvar_t vhbc_cv;		/* broadcast when thr_count drops to 0 */
	int vhbc_thr_count;		/* outstanding bus_config_phci threads */
} mdi_vhci_bus_config_t;
8452
8453/*
8454 * bus config the specified phci
8455 */
8456static void
8457bus_config_phci(void *arg)
8458{
8459	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
8460	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
8461	dev_info_t *ph_dip;
8462
8463	/*
8464	 * first configure all path components upto phci and then configure
8465	 * the phci children.
8466	 */
8467	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
8468	    != NULL) {
8469		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
8470		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
8471			(void) ndi_devi_config_driver(ph_dip,
8472			    vhbc->vhbc_op_flags,
8473			    vhbc->vhbc_op_major);
8474		} else
8475			(void) ndi_devi_config(ph_dip,
8476			    vhbc->vhbc_op_flags);
8477
8478		/* release the hold that e_ddi_hold_devi_by_path() placed */
8479		ndi_rele_devi(ph_dip);
8480	}
8481
8482	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
8483	kmem_free(phbc, sizeof (*phbc));
8484
8485	mutex_enter(&vhbc->vhbc_lock);
8486	vhbc->vhbc_thr_count--;
8487	if (vhbc->vhbc_thr_count == 0)
8488		cv_broadcast(&vhbc->vhbc_cv);
8489	mutex_exit(&vhbc->vhbc_lock);
8490}
8491
8492/*
8493 * Bus config all phcis associated with the vhci in parallel.
8494 * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
8495 */
static void
bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
    ddi_bus_config_op_t op, major_t maj)
{
	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
	mdi_vhci_bus_config_t *vhbc;
	mdi_vhcache_phci_t *cphci;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (vhcache->vhcache_phci_head == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);

	/* build one work-list entry per eligible cached phci */
	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci->cphci_next) {
		/* skip phcis that haven't attached before root is available */
		if (!modrootloaded && (cphci->cphci_phci == NULL))
			continue;
		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
		    KM_SLEEP);
		phbc->phbc_vhbusconfig = vhbc;
		phbc->phbc_next = phbc_head;
		phbc_head = phbc;
		vhbc->vhbc_thr_count++;
	}
	rw_exit(&vhcache->vhcache_lock);

	vhbc->vhbc_op = op;
	vhbc->vhbc_op_major = maj;
	vhbc->vhbc_op_flags = NDI_NO_EVENT |
	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);

	/* now create threads to initiate bus config on all phcis in parallel */
	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
		phbc_next = phbc->phbc_next;
		if (mdi_mtc_off)
			/* multi-threaded config disabled: run inline */
			bus_config_phci((void *)phbc);
		else
			(void) thread_create(NULL, 0, bus_config_phci, phbc,
			    0, &p0, TS_RUN, minclsyspri);
	}

	mutex_enter(&vhbc->vhbc_lock);
	/* wait until all threads exit */
	while (vhbc->vhbc_thr_count > 0)
		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
	mutex_exit(&vhbc->vhbc_lock);

	mutex_destroy(&vhbc->vhbc_lock);
	cv_destroy(&vhbc->vhbc_cv);
	kmem_free(vhbc, sizeof (*vhbc));
}
8554
8555/*
8556 * Single threaded version of bus_config_all_phcis()
8557 */
8558static void
8559st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
8560    ddi_bus_config_op_t op, major_t maj)
8561{
8562	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8563
8564	single_threaded_vhconfig_enter(vhc);
8565	bus_config_all_phcis(vhcache, flags, op, maj);
8566	single_threaded_vhconfig_exit(vhc);
8567}
8568
8569/*
8570 * Perform BUS_CONFIG_ONE on the specified child of the phci.
8571 * The path includes the child component in addition to the phci path.
8572 */
8573static int
8574bus_config_one_phci_child(char *path)
8575{
8576	dev_info_t *ph_dip, *child;
8577	char *devnm;
8578	int rv = MDI_FAILURE;
8579
8580	/* extract the child component of the phci */
8581	devnm = strrchr(path, '/');
8582	*devnm++ = '\0';
8583
8584	/*
8585	 * first configure all path components upto phci and then
8586	 * configure the phci child.
8587	 */
8588	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
8589		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
8590		    NDI_SUCCESS) {
8591			/*
8592			 * release the hold that ndi_devi_config_one() placed
8593			 */
8594			ndi_rele_devi(child);
8595			rv = MDI_SUCCESS;
8596		}
8597
8598		/* release the hold that e_ddi_hold_devi_by_path() placed */
8599		ndi_rele_devi(ph_dip);
8600	}
8601
8602	devnm--;
8603	*devnm = '/';
8604	return (rv);
8605}
8606
8607/*
8608 * Build a list of phci client paths for the specified vhci client.
8609 * The list includes only those phci client paths which aren't configured yet.
8610 */
8611static mdi_phys_path_t *
8612build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
8613{
8614	mdi_vhcache_pathinfo_t *cpi;
8615	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
8616	int config_path, len;
8617
8618	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8619		/*
8620		 * include only those paths that aren't configured.
8621		 */
8622		config_path = 0;
8623		if (cpi->cpi_pip == NULL)
8624			config_path = 1;
8625		else {
8626			MDI_PI_LOCK(cpi->cpi_pip);
8627			if (MDI_PI_IS_INIT(cpi->cpi_pip))
8628				config_path = 1;
8629			MDI_PI_UNLOCK(cpi->cpi_pip);
8630		}
8631
8632		if (config_path) {
8633			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
8634			len = strlen(cpi->cpi_cphci->cphci_path) +
8635			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
8636			pp->phys_path = kmem_alloc(len, KM_SLEEP);
8637			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
8638			    cpi->cpi_cphci->cphci_path, ct_name,
8639			    cpi->cpi_addr);
8640			pp->phys_path_next = NULL;
8641
8642			if (pp_head == NULL)
8643				pp_head = pp;
8644			else
8645				pp_tail->phys_path_next = pp;
8646			pp_tail = pp;
8647		}
8648	}
8649
8650	return (pp_head);
8651}
8652
8653/*
8654 * Free the memory allocated for phci client path list.
8655 */
8656static void
8657free_phclient_path_list(mdi_phys_path_t *pp_head)
8658{
8659	mdi_phys_path_t *pp, *pp_next;
8660
8661	for (pp = pp_head; pp != NULL; pp = pp_next) {
8662		pp_next = pp->phys_path_next;
8663		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
8664		kmem_free(pp, sizeof (*pp));
8665	}
8666}
8667
8668/*
 * Allocate an async client structure and initialize it with the specified
 * values.
8670 */
8671static mdi_async_client_config_t *
8672alloc_async_client_config(char *ct_name, char *ct_addr,
8673    mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8674{
8675	mdi_async_client_config_t *acc;
8676
8677	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
8678	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
8679	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
8680	acc->acc_phclient_path_list_head = pp_head;
8681	init_vhcache_lookup_token(&acc->acc_token, tok);
8682	acc->acc_next = NULL;
8683	return (acc);
8684}
8685
8686/*
8687 * Free the memory allocated for the async client structure and their members.
8688 */
8689static void
8690free_async_client_config(mdi_async_client_config_t *acc)
8691{
8692	if (acc->acc_phclient_path_list_head)
8693		free_phclient_path_list(acc->acc_phclient_path_list_head);
8694	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
8695	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
8696	kmem_free(acc, sizeof (*acc));
8697}
8698
8699/*
8700 * Sort vhcache pathinfos (cpis) of the specified client.
8701 * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
8702 * flag set come at the beginning of the list. All cpis which have this
8703 * flag set come at the end of the list.
8704 */
8705static void
8706sort_vhcache_paths(mdi_vhcache_client_t *cct)
8707{
8708	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
8709
8710	cpi_head = cct->cct_cpi_head;
8711	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8712	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8713		cpi_next = cpi->cpi_next;
8714		enqueue_vhcache_pathinfo(cct, cpi);
8715	}
8716}
8717
8718/*
8719 * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
8720 * every vhcache pathinfo of the specified client. If not adjust the flag
8721 * setting appropriately.
8722 *
8723 * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
8724 * on-disk vhci cache. So every time this flag is updated the cache must be
8725 * flushed.
8726 */
static void
adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
    mdi_vhcache_lookup_token_t *tok)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_pathinfo_t *cpi;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
	    == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/*
	 * to avoid unnecessary on-disk cache updates, first check if an
	 * update is really needed. If no update is needed simply return.
	 */
	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		/* flag is stale if it disagrees with cpi_pip's existence */
		if ((cpi->cpi_pip != NULL &&
		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
		    (cpi->cpi_pip == NULL &&
		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
			break;
		}
	}
	if (cpi == NULL) {
		/* all flags are consistent; nothing to update */
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/*
	 * Upgrade to a writer lock. If the upgrade fails the lock was
	 * dropped, so the client must be looked up again under the
	 * writer lock before it can be used.
	 */
	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
		rw_exit(&vhcache->vhcache_lock);
		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
		    tok)) == NULL) {
			rw_exit(&vhcache->vhcache_lock);
			return;
		}
	}

	/* set each flag from the current existence of the path */
	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		if (cpi->cpi_pip != NULL)
			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
		else
			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
	}
	sort_vhcache_paths(cct);

	rw_exit(&vhcache->vhcache_lock);
	/* the flag is persisted on disk, so the cache must be flushed */
	vhcache_dirty(vhc);
}
8780
8781/*
8782 * Configure all specified paths of the client.
8783 */
8784static void
8785config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8786    mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8787{
8788	mdi_phys_path_t *pp;
8789
8790	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8791		(void) bus_config_one_phci_child(pp->phys_path);
8792	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8793}
8794
8795/*
8796 * Dequeue elements from vhci async client config list and bus configure
8797 * their corresponding phci clients.
8798 */
static void
config_client_paths_thread(void *arg)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	mdi_async_client_config_t *acc;
	clock_t quit_at_ticks;
	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
	    "mdi_config_client_paths");

	for (; ; ) {
		quit_at_ticks = ddi_get_lbolt() + idle_time;

		/* wait for work until asked to exit or the idle timeout */
		mutex_enter(&vhc->vhc_lock);
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    vhc->vhc_acc_list_head == NULL &&
		    ddi_get_lbolt() < quit_at_ticks) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
			    quit_at_ticks);
			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
		}

		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
		    vhc->vhc_acc_list_head == NULL)
			goto out;

		/* dequeue the head entry, maintaining the tail pointer */
		acc = vhc->vhc_acc_list_head;
		vhc->vhc_acc_list_head = acc->acc_next;
		if (vhc->vhc_acc_list_head == NULL)
			vhc->vhc_acc_list_tail = NULL;
		vhc->vhc_acc_count--;
		mutex_exit(&vhc->vhc_lock);

		/* bus config the paths of this entry without holding locks */
		config_client_paths_sync(vhc, acc->acc_ct_name,
		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
		    &acc->acc_token);

		free_async_client_config(acc);
	}

out:
	vhc->vhc_acc_thrcount--;
	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
	CALLB_CPR_EXIT(&cprinfo);
}
8847
8848/*
8849 * Arrange for all the phci client paths (pp_head) for the specified client
8850 * to be bus configured asynchronously by a thread.
8851 */
static void
config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
    mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
{
	mdi_async_client_config_t *acc, *newacc;
	int create_thread;

	if (pp_head == NULL)
		return;

	if (mdi_mtc_off) {
		/* multi-threaded config disabled: do the work inline */
		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
		free_phclient_path_list(pp_head);
		return;
	}

	/* newacc takes ownership of pp_head */
	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
	ASSERT(newacc);

	/* if the client is already queued, drop this duplicate request */
	mutex_enter(&vhc->vhc_lock);
	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
			free_async_client_config(newacc);
			mutex_exit(&vhc->vhc_lock);
			return;
		}
	}

	/* append to the work list consumed by config_client_paths_thread() */
	if (vhc->vhc_acc_list_head == NULL)
		vhc->vhc_acc_list_head = newacc;
	else
		vhc->vhc_acc_list_tail->acc_next = newacc;
	vhc->vhc_acc_list_tail = newacc;
	vhc->vhc_acc_count++;
	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
		/* enough worker threads exist; just wake them */
		cv_broadcast(&vhc->vhc_cv);
		create_thread = 0;
	} else {
		vhc->vhc_acc_thrcount++;
		create_thread = 1;
	}
	mutex_exit(&vhc->vhc_lock);

	if (create_thread)
		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
		    0, &p0, TS_RUN, minclsyspri);
}
8900
8901/*
8902 * Return number of online paths for the specified client.
8903 */
8904static int
8905nonline_paths(mdi_vhcache_client_t *cct)
8906{
8907	mdi_vhcache_pathinfo_t *cpi;
8908	int online_count = 0;
8909
8910	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8911		if (cpi->cpi_pip != NULL) {
8912			MDI_PI_LOCK(cpi->cpi_pip);
8913			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8914				online_count++;
8915			MDI_PI_UNLOCK(cpi->cpi_pip);
8916		}
8917	}
8918
8919	return (online_count);
8920}
8921
8922/*
8923 * Bus configure all paths for the specified vhci client.
8924 * If at least one path for the client is already online, the remaining paths
8925 * will be configured asynchronously. Otherwise, it synchronously configures
8926 * the paths until at least one path is online and then rest of the paths
8927 * will be configured asynchronously.
8928 */
static void
config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_phys_path_t *pp_head, *pp;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_lookup_token_t tok;

	/* entered with the vhcache lock held; it is dropped on every path */
	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	init_vhcache_lookup_token(&tok, NULL);

	if (ct_name == NULL || ct_addr == NULL ||
	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
	    == NULL ||
	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/* if at least one path is online, configure the rest asynchronously */
	if (nonline_paths(cct) > 0) {
		rw_exit(&vhcache->vhcache_lock);
		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
		return;
	}

	rw_exit(&vhcache->vhcache_lock);

	/*
	 * No path online yet: configure paths synchronously one at a time.
	 * As soon as one comes online, hand the remainder of the list over
	 * to the async machinery.
	 */
	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
			rw_enter(&vhcache->vhcache_lock, RW_READER);

			/* re-lookup; the cache may have been cleaned */
			if ((cct = lookup_vhcache_client(vhcache, ct_name,
			    ct_addr, &tok)) == NULL) {
				rw_exit(&vhcache->vhcache_lock);
				goto out;
			}

			if (nonline_paths(cct) > 0 &&
			    pp->phys_path_next != NULL) {
				rw_exit(&vhcache->vhcache_lock);
				config_client_paths_async(vhc, ct_name, ct_addr,
				    pp->phys_path_next, &tok);
				/* remainder now owned by the async request */
				pp->phys_path_next = NULL;
				goto out;
			}

			rw_exit(&vhcache->vhcache_lock);
		}
	}

	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
out:
	free_phclient_path_list(pp_head);
}
8985
8986static void
8987single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8988{
8989	mutex_enter(&vhc->vhc_lock);
8990	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8991		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8992	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8993	mutex_exit(&vhc->vhc_lock);
8994}
8995
8996static void
8997single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8998{
8999	mutex_enter(&vhc->vhc_lock);
9000	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
9001	cv_broadcast(&vhc->vhc_cv);
9002	mutex_exit(&vhc->vhc_lock);
9003}
9004
/* one entry per phci driver in the built-in driver tables below */
typedef struct mdi_phci_driver_info {
	char	*phdriver_name;	/* name of the phci driver */

	/* set to non zero if the phci driver supports root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;
9011
9012/*
9013 * vhci class and root support capability of a phci driver can be
9014 * specified using ddi-vhci-class and ddi-no-root-support properties in the
9015 * phci driver.conf file. The built-in tables below contain this information
9016 * for those phci drivers whose driver.conf files don't yet contain this info.
9017 *
 * All phci drivers except iscsi have root device support.
9019 */
static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
	{ "fp", 1 },		/* root device support */
	{ "iscsi", 0 },		/* no root device support */
	{ "ibsrp", 1 }		/* root device support */
	};
9025
9026static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };
9027
9028static void *
9029mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
9030{
9031	void *new_ptr;
9032
9033	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
9034	if (old_ptr) {
9035		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
9036		kmem_free(old_ptr, old_size);
9037	}
9038	return (new_ptr);
9039}
9040
9041static void
9042add_to_phci_list(char ***driver_list, int **root_support_list,
9043    int *cur_elements, int *max_elements, char *driver_name, int root_support)
9044{
9045	ASSERT(*cur_elements <= *max_elements);
9046	if (*cur_elements == *max_elements) {
9047		*max_elements += 10;
9048		*driver_list = mdi_realloc(*driver_list,
9049		    sizeof (char *) * (*cur_elements),
9050		    sizeof (char *) * (*max_elements));
9051		*root_support_list = mdi_realloc(*root_support_list,
9052		    sizeof (int) * (*cur_elements),
9053		    sizeof (int) * (*max_elements));
9054	}
9055	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
9056	(*root_support_list)[*cur_elements] = root_support;
9057	(*cur_elements)++;
9058}
9059
/*
 * Build parallel lists of phci driver names and root-support flags for the
 * given vhci class.  Entries come first from phci driver.conf properties
 * (ddi-vhci-class / ddi-no-root-support) and then from the built-in tables
 * above for drivers whose driver.conf files don't carry this information.
 * On return *cur_elements is the entry count and *max_elements the
 * allocated capacity; the caller frees both lists (see
 * attach_phci_drivers() for the pattern).
 */
static void
get_phci_driver_list(char *vhci_class, char ***driver_list,
    int **root_support_list, int *cur_elements, int *max_elements)
{
	mdi_phci_driver_info_t	*st_driver_list, *p;
	int		st_ndrivers, root_support, i, j, driver_conf_count;
	major_t		m;
	struct devnames	*dnp;
	ddi_prop_t	*propp;

	*driver_list = NULL;
	*root_support_list = NULL;
	*cur_elements = 0;
	*max_elements = 0;

	/* add the phci drivers derived from the phci driver.conf files */
	for (m = 0; m < devcnt; m++) {
		dnp = &devnamesp[m];

		if (dnp->dn_flags & DN_PHCI_DRIVER) {
			LOCK_DEV_OPS(&dnp->dn_lock);
			if (dnp->dn_global_prop_ptr != NULL &&
			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
			    strcmp(propp->prop_val, vhci_class) == 0) {

				/*
				 * Root support is the default; a driver
				 * opts out via the ddi-no-root-support
				 * property.
				 */
				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
				    &dnp->dn_global_prop_ptr->prop_list)
				    == NULL) ? 1 : 0;

				add_to_phci_list(driver_list, root_support_list,
				    cur_elements, max_elements, dnp->dn_name,
				    root_support);

				UNLOCK_DEV_OPS(&dnp->dn_lock);
			} else
				UNLOCK_DEV_OPS(&dnp->dn_lock);
		}
	}

	/* remember how many came from driver.conf for the dedup scan below */
	driver_conf_count = *cur_elements;

	/* add the phci drivers specified in the built-in tables */
	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
		st_driver_list = scsi_phci_driver_list;
		st_ndrivers = sizeof (scsi_phci_driver_list) /
		    sizeof (mdi_phci_driver_info_t);
	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
		st_driver_list = ib_phci_driver_list;
		st_ndrivers = sizeof (ib_phci_driver_list) /
		    sizeof (mdi_phci_driver_info_t);
	} else {
		st_driver_list = NULL;
		st_ndrivers = 0;
	}

	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
		/* add this phci driver if not already added before */
		for (j = 0; j < driver_conf_count; j++) {
			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
				break;
		}
		if (j == driver_conf_count) {
			add_to_phci_list(driver_list, root_support_list,
			    cur_elements, max_elements, p->phdriver_name,
			    p->phdriver_root_support);
		}
	}
}
9131
9132/*
9133 * Attach the phci driver instances associated with the specified vhci class.
9134 * If root is mounted attach all phci driver instances.
9135 * If root is not mounted, attach the instances of only those phci
9136 * drivers that have the root support.
9137 */
9138static void
9139attach_phci_drivers(char *vhci_class)
9140{
9141	char	**driver_list, **p;
9142	int	*root_support_list;
9143	int	cur_elements, max_elements, i;
9144	major_t	m;
9145
9146	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9147	    &cur_elements, &max_elements);
9148
9149	for (i = 0; i < cur_elements; i++) {
9150		if (modrootloaded || root_support_list[i]) {
9151			m = ddi_name_to_major(driver_list[i]);
9152			if (m != DDI_MAJOR_T_NONE &&
9153			    ddi_hold_installed_driver(m))
9154				ddi_rele_driver(m);
9155		}
9156	}
9157
9158	if (driver_list) {
9159		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
9160			kmem_free(*p, strlen(*p) + 1);
9161		kmem_free(driver_list, sizeof (char *) * max_elements);
9162		kmem_free(root_support_list, sizeof (int) * max_elements);
9163	}
9164}
9165
9166/*
9167 * Build vhci cache:
9168 *
9169 * Attach phci driver instances and then drive BUS_CONFIG_ALL on
9170 * the phci driver instances. During this process the cache gets built.
9171 *
9172 * Cache is built fully if the root is mounted.
9173 * If the root is not mounted, phci drivers that do not have root support
9174 * are not attached. As a result the cache is built partially. The entries
9175 * in the cache reflect only those phci drivers that have root support.
9176 */
static int
build_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;

	/* serialize cache building against other vhci config operations */
	single_threaded_vhconfig_enter(vhc);

	/* nothing to do if another thread already built the cache */
	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
		rw_exit(&vhcache->vhcache_lock);
		single_threaded_vhconfig_exit(vhc);
		return (0);
	}
	rw_exit(&vhcache->vhcache_lock);

	/* enumerating all paths under all phcis populates the cache */
	attach_phci_drivers(vh->vh_class);
	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);

	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
	rw_exit(&vhcache->vhcache_lock);

	single_threaded_vhconfig_exit(vhc);
	/* schedule the freshly built cache for write-out to disk */
	vhcache_dirty(vhc);
	/* 1 tells the caller we built the cache (and ran BUS_CONFIG_ALL) */
	return (1);
}
9205
9206/*
9207 * Determine if discovery of paths is needed.
9208 */
static int
vhcache_do_discovery(mdi_vhci_config_t *vhc)
{
	int rv = 1;	/* default: do discovery */

	mutex_enter(&vhc->vhc_lock);
	if (i_ddi_io_initialized() == 0) {
		/* during boot, allow a limited number of discoveries */
		if (vhc->vhc_path_discovery_boot > 0) {
			vhc->vhc_path_discovery_boot--;
			goto out;
		}
	} else {
		/* after boot, a separate discovery budget applies */
		if (vhc->vhc_path_discovery_postboot > 0) {
			vhc->vhc_path_discovery_postboot--;
			goto out;
		}
	}

	/*
	 * Do full path discovery at most once per mdi_path_discovery_interval.
	 * This is to avoid a series of full path discoveries when opening
	 * stale /dev/[r]dsk links.  Discovery is allowed once the cutoff
	 * time (set by vhcache_discover_paths()) has passed; an interval of
	 * -1 disables this time-based discovery entirely.
	 */
	if (mdi_path_discovery_interval != -1 &&
	    ddi_get_lbolt64() >= vhc->vhc_path_discovery_cutoff_time)
		goto out;

	rv = 0;
out:
	mutex_exit(&vhc->vhc_lock);
	return (rv);
}
9241
9242/*
9243 * Discover all paths:
9244 *
9245 * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
9246 * driver instances. During this process all paths will be discovered.
9247 */
static int
vhcache_discover_paths(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	int rv = 0;

	/* serialize against other vhci configuration operations */
	single_threaded_vhconfig_enter(vhc);

	if (vhcache_do_discovery(vhc)) {
		attach_phci_drivers(vh->vh_class);
		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
		    NDI_NO_EVENT, BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);

		/*
		 * Rate-limit: vhcache_do_discovery() denies time-based
		 * discovery until this cutoff time has passed.
		 */
		mutex_enter(&vhc->vhc_lock);
		vhc->vhc_path_discovery_cutoff_time = ddi_get_lbolt64() +
		    mdi_path_discovery_interval * TICKS_PER_SECOND;
		mutex_exit(&vhc->vhc_lock);
		rv = 1;	/* discovery was performed */
	}

	single_threaded_vhconfig_exit(vhc);
	return (rv);
}
9272
9273/*
9274 * Generic vhci bus config implementation:
9275 *
9276 * Parameters
9277 *	vdip	vhci dip
9278 *	flags	bus config flags
9279 *	op	bus config operation
9280 *	The remaining parameters are bus config operation specific
9281 *
9282 * for BUS_CONFIG_ONE
9283 *	arg	pointer to name@addr
9284 *	child	upon successful return from this function, *child will be
9285 *		set to the configured and held devinfo child node of vdip.
9286 *	ct_addr	pointer to client address (i.e. GUID)
9287 *
9288 * for BUS_CONFIG_DRIVER
9289 *	arg	major number of the driver
9290 *	child and ct_addr parameters are ignored
9291 *
9292 * for BUS_CONFIG_ALL
9293 *	arg, child, and ct_addr parameters are ignored
9294 *
9295 * Note that for the rest of the bus config operations, this function simply
9296 * calls the framework provided default bus config routine.
9297 */
int
mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
    void *arg, dev_info_t **child, char *ct_addr)
{
	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	int rv = 0;
	int params_valid = 0;
	char *cp;

	/*
	 * To bus config vhcis we relay operation, possibly using another
	 * thread, to phcis. The phci driver then interacts with MDI to cause
	 * vhci child nodes to be enumerated under the vhci node.  Adding a
	 * vhci child requires an ndi_devi_enter of the vhci. Since another
	 * thread may be adding the child, to avoid deadlock we can't wait
	 * for the relayed operations to complete if we have already entered
	 * the vhci node.
	 */
	if (DEVI_BUSY_OWNED(vdip)) {
		MDI_DEBUG(2, (MDI_NOTE, vdip,
		    "vhci dip is busy owned %p", (void *)vdip));
		goto default_bus_config;
	}

	/*
	 * Build the vhci cache on first use.  rv == 1 afterwards means the
	 * cache was built just now, which already drove BUS_CONFIG_ALL on
	 * all phcis; the rv == 0 checks below avoid repeating that work.
	 */
	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
		rw_exit(&vhcache->vhcache_lock);
		rv = build_vhci_cache(vh);
		rw_enter(&vhcache->vhcache_lock, RW_READER);
	}

	switch (op) {
	case BUS_CONFIG_ONE:
		if (arg != NULL && ct_addr != NULL) {
			/* extract node name */
			cp = (char *)arg;
			while (*cp != '\0' && *cp != '@')
				cp++;
			if (*cp == '@') {
				params_valid = 1;
				/* temporarily split "name@addr" at the '@' */
				*cp = '\0';
				config_client_paths(vhc, (char *)arg, ct_addr);
				/* config_client_paths() releases cache_lock */
				*cp = '@';
				break;
			}
		}

		rw_exit(&vhcache->vhcache_lock);
		break;

	case BUS_CONFIG_DRIVER:
		rw_exit(&vhcache->vhcache_lock);
		if (rv == 0)
			st_bus_config_all_phcis(vhc, flags, op,
			    (major_t)(uintptr_t)arg);
		break;

	case BUS_CONFIG_ALL:
		rw_exit(&vhcache->vhcache_lock);
		if (rv == 0)
			st_bus_config_all_phcis(vhc, flags, op, -1);
		break;

	default:
		rw_exit(&vhcache->vhcache_lock);
		break;
	}


default_bus_config:
	/*
	 * All requested child nodes are enumerated under the vhci.
	 * Now configure them.
	 */
	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
	    NDI_SUCCESS) {
		return (MDI_SUCCESS);
	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
		/* discover all paths and try configuring again */
		if (vhcache_discover_paths(vh) &&
		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
		    NDI_SUCCESS)
			return (MDI_SUCCESS);
	}

	return (MDI_FAILURE);
}
9388
9389/*
9390 * Read the on-disk vhci cache into an nvlist for the specified vhci class.
9391 */
9392static nvlist_t *
9393read_on_disk_vhci_cache(char *vhci_class)
9394{
9395	nvlist_t *nvl;
9396	int err;
9397	char *filename;
9398
9399	filename = vhclass2vhcache_filename(vhci_class);
9400
9401	if ((err = fread_nvlist(filename, &nvl)) == 0) {
9402		kmem_free(filename, strlen(filename) + 1);
9403		return (nvl);
9404	} else if (err == EIO)
9405		cmn_err(CE_WARN, "%s: I/O error, will recreate", filename);
9406	else if (err == EINVAL)
9407		cmn_err(CE_WARN,
9408		    "%s: data file corrupted, will recreate", filename);
9409
9410	kmem_free(filename, strlen(filename) + 1);
9411	return (NULL);
9412}
9413
9414/*
9415 * Read on-disk vhci cache into nvlists for all vhci classes.
9416 * Called during booting by i_ddi_read_devices_files().
9417 */
9418void
9419mdi_read_devices_files(void)
9420{
9421	int i;
9422
9423	for (i = 0; i < N_VHCI_CLASSES; i++)
9424		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
9425}
9426
9427/*
9428 * Remove all stale entries from vhci cache.
9429 */
9430static void
9431clean_vhcache(mdi_vhci_config_t *vhc)
9432{
9433	mdi_vhci_cache_t	*vhcache = &vhc->vhc_vhcache;
9434	mdi_vhcache_phci_t	*phci, *nxt_phci;
9435	mdi_vhcache_client_t	*client, *nxt_client;
9436	mdi_vhcache_pathinfo_t	*path, *nxt_path;
9437
9438	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9439
9440	client = vhcache->vhcache_client_head;
9441	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
9442	for ( ; client != NULL; client = nxt_client) {
9443		nxt_client = client->cct_next;
9444
9445		path = client->cct_cpi_head;
9446		client->cct_cpi_head = client->cct_cpi_tail = NULL;
9447		for ( ; path != NULL; path = nxt_path) {
9448			nxt_path = path->cpi_next;
9449			if ((path->cpi_cphci->cphci_phci != NULL) &&
9450			    (path->cpi_pip != NULL)) {
9451				enqueue_tail_vhcache_pathinfo(client, path);
9452			} else if (path->cpi_pip != NULL) {
9453				/* Not valid to have a path without a phci. */
9454				free_vhcache_pathinfo(path);
9455			}
9456		}
9457
9458		if (client->cct_cpi_head != NULL)
9459			enqueue_vhcache_client(vhcache, client);
9460		else {
9461			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
9462			    (mod_hash_key_t)client->cct_name_addr);
9463			free_vhcache_client(client);
9464		}
9465	}
9466
9467	phci = vhcache->vhcache_phci_head;
9468	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
9469	for ( ; phci != NULL; phci = nxt_phci) {
9470
9471		nxt_phci = phci->cphci_next;
9472		if (phci->cphci_phci != NULL)
9473			enqueue_vhcache_phci(vhcache, phci);
9474		else
9475			free_vhcache_phci(phci);
9476	}
9477
9478	vhcache->vhcache_clean_time = ddi_get_lbolt64();
9479	rw_exit(&vhcache->vhcache_lock);
9480	vhcache_dirty(vhc);
9481}
9482
9483/*
9484 * Remove all stale entries from vhci cache.
9485 * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
9486 */
void
mdi_clean_vhcache(void)
{
	mdi_vhci_t *vh;

	mutex_enter(&mdi_mutex);
	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
		/*
		 * Hold the vhci via its refcount so it stays on the list
		 * while mdi_mutex is dropped for the clean operation.
		 */
		vh->vh_refcnt++;
		mutex_exit(&mdi_mutex);
		clean_vhcache(vh->vh_config);
		mutex_enter(&mdi_mutex);
		vh->vh_refcnt--;
	}
	mutex_exit(&mdi_mutex);
}
9502
9503/*
9504 * mdi_vhci_walk_clients():
9505 *		Walker routine to traverse client dev_info nodes
9506 * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
 * below the client, including nexus devices, which we don't want.
9508 * So we just traverse the immediate siblings, starting from 1st client.
9509 */
9510void
9511mdi_vhci_walk_clients(dev_info_t *vdip,
9512    int (*f)(dev_info_t *, void *), void *arg)
9513{
9514	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9515	dev_info_t	*cdip;
9516	mdi_client_t	*ct;
9517
9518	MDI_VHCI_CLIENT_LOCK(vh);
9519	cdip = ddi_get_child(vdip);
9520	while (cdip) {
9521		ct = i_devi_get_client(cdip);
9522		MDI_CLIENT_LOCK(ct);
9523
9524		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
9525			cdip = ddi_get_next_sibling(cdip);
9526		else
9527			cdip = NULL;
9528
9529		MDI_CLIENT_UNLOCK(ct);
9530	}
9531	MDI_VHCI_CLIENT_UNLOCK(vh);
9532}
9533
9534/*
9535 * mdi_vhci_walk_phcis():
9536 *		Walker routine to traverse phci dev_info nodes
9537 */
9538void
9539mdi_vhci_walk_phcis(dev_info_t *vdip,
9540    int (*f)(dev_info_t *, void *), void *arg)
9541{
9542	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9543	mdi_phci_t	*ph, *next;
9544
9545	MDI_VHCI_PHCI_LOCK(vh);
9546	ph = vh->vh_phci_head;
9547	while (ph) {
9548		MDI_PHCI_LOCK(ph);
9549
9550		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
9551			next = ph->ph_next;
9552		else
9553			next = NULL;
9554
9555		MDI_PHCI_UNLOCK(ph);
9556		ph = next;
9557	}
9558	MDI_VHCI_PHCI_UNLOCK(vh);
9559}
9560
9561
9562/*
9563 * mdi_walk_vhcis():
9564 *		Walker routine to traverse vhci dev_info nodes
9565 */
9566void
9567mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
9568{
9569	mdi_vhci_t	*vh = NULL;
9570
9571	mutex_enter(&mdi_mutex);
9572	/*
9573	 * Scan for already registered vhci
9574	 */
9575	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9576		vh->vh_refcnt++;
9577		mutex_exit(&mdi_mutex);
9578		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
9579			mutex_enter(&mdi_mutex);
9580			vh->vh_refcnt--;
9581			break;
9582		} else {
9583			mutex_enter(&mdi_mutex);
9584			vh->vh_refcnt--;
9585		}
9586	}
9587
9588	mutex_exit(&mdi_mutex);
9589}
9590
9591/*
9592 * i_mdi_log_sysevent():
9593 *		Logs events for pickup by syseventd
9594 */
static void
i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
{
	char		*path_name;
	nvlist_t	*attr_list;

	/*
	 * NOTE(review): the nvlist routines return 0 or an errno value,
	 * not a DDI status; the comparisons against DDI_SUCCESS below work
	 * only because DDI_SUCCESS is 0.
	 */
	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
	    KM_SLEEP) != DDI_SUCCESS) {
		goto alloc_failed;
	}

	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) ddi_pathname(dip, path_name);

	/* attach driver name, major, instance, devinfo path, and class */
	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
	    ddi_driver_name(dip)) != DDI_SUCCESS) {
		goto error;
	}

	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
		goto error;
	}

	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
		goto error;
	}

	if (nvlist_add_string(attr_list, DDI_PATHNAME,
	    path_name) != DDI_SUCCESS) {
		goto error;
	}

	if (nvlist_add_string(attr_list, DDI_CLASS,
	    ph_vh_class) != DDI_SUCCESS) {
		goto error;
	}

	/* best-effort post; the sysevent result is deliberately ignored */
	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
	    attr_list, NULL, DDI_SLEEP);

	/* common cleanup for both the success and the error paths */
error:
	kmem_free(path_name, MAXPATHLEN);
	nvlist_free(attr_list);
	return;

alloc_failed:
	MDI_DEBUG(1, (MDI_WARN, dip, "!unable to send sysevent"));
}
9645
9646char **
9647mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
9648{
9649	char	**driver_list, **ret_driver_list = NULL;
9650	int	*root_support_list;
9651	int	cur_elements, max_elements;
9652
9653	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9654	    &cur_elements, &max_elements);
9655
9656
9657	if (driver_list) {
9658		kmem_free(root_support_list, sizeof (int) * max_elements);
9659		ret_driver_list = mdi_realloc(driver_list, sizeof (char *)
9660		    * max_elements, sizeof (char *) * cur_elements);
9661	}
9662	*ndrivers = cur_elements;
9663
9664	return (ret_driver_list);
9665
9666}
9667
9668void
9669mdi_free_phci_driver_list(char **driver_list, int ndrivers)
9670{
9671	char	**p;
9672	int	i;
9673
9674	if (driver_list) {
9675		for (i = 0, p = driver_list; i < ndrivers; i++, p++)
9676			kmem_free(*p, strlen(*p) + 1);
9677		kmem_free(driver_list, sizeof (char *) * ndrivers);
9678	}
9679}
9680
9681/*
9682 * mdi_is_dev_supported():
9683 *		function called by pHCI bus config operation to determine if a
9684 *		device should be represented as a child of the vHCI or the
 *		pHCI.  This decision is made by the vHCI, using cinfo identity
9686 *		information passed by the pHCI - specifics of the cinfo
9687 *		representation are by agreement between the pHCI and vHCI.
9688 * Return Values:
9689 *		MDI_SUCCESS
9690 *		MDI_FAILURE
9691 */
9692int
9693mdi_is_dev_supported(char *class, dev_info_t *pdip, void *cinfo)
9694{
9695	mdi_vhci_t	*vh;
9696
9697	ASSERT(class && pdip);
9698
9699	/*
9700	 * For dev_supported, mdi_phci_register() must have established pdip as
9701	 * a pHCI.
9702	 *
9703	 * NOTE: mdi_phci_register() does "mpxio-disable" processing, and
9704	 * MDI_PHCI(pdip) will return false if mpxio is disabled.
9705	 */
9706	if (!MDI_PHCI(pdip))
9707		return (MDI_FAILURE);
9708
9709	/* Return MDI_FAILURE if vHCI does not support asking the question. */
9710	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
9711	if ((vh == NULL) || (vh->vh_ops->vo_is_dev_supported == NULL)) {
9712		return (MDI_FAILURE);
9713	}
9714
9715	/* Return vHCI answer */
9716	return (vh->vh_ops->vo_is_dev_supported(vh->vh_dip, pdip, cinfo));
9717}
9718
9719int
9720mdi_dc_return_dev_state(mdi_pathinfo_t *pip, struct devctl_iocdata *dcp)
9721{
9722	uint_t devstate = 0;
9723	dev_info_t *cdip;
9724
9725	if ((pip == NULL) || (dcp == NULL))
9726		return (MDI_FAILURE);
9727
9728	cdip = mdi_pi_get_client(pip);
9729
9730	switch (mdi_pi_get_state(pip)) {
9731	case MDI_PATHINFO_STATE_INIT:
9732		devstate = DEVICE_DOWN;
9733		break;
9734	case MDI_PATHINFO_STATE_ONLINE:
9735		devstate = DEVICE_ONLINE;
9736		if ((cdip) && (devi_stillreferenced(cdip) == DEVI_REFERENCED))
9737			devstate |= DEVICE_BUSY;
9738		break;
9739	case MDI_PATHINFO_STATE_STANDBY:
9740		devstate = DEVICE_ONLINE;
9741		break;
9742	case MDI_PATHINFO_STATE_FAULT:
9743		devstate = DEVICE_DOWN;
9744		break;
9745	case MDI_PATHINFO_STATE_OFFLINE:
9746		devstate = DEVICE_OFFLINE;
9747		break;
9748	default:
9749		ASSERT(MDI_PI(pip)->pi_state);
9750	}
9751
9752	if (copyout(&devstate, dcp->cpyout_buf, sizeof (uint_t)) != 0)
9753		return (MDI_FAILURE);
9754
9755	return (MDI_SUCCESS);
9756}
9757